From 7aabbf22f0a55370638cf1bcd1004376b0d7dad8 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Fri, 7 Feb 2025 10:34:16 -0800 Subject: [PATCH 001/293] [llvm][ELF] Separate out .dwo bytes written in stats (#126165) So we can distinguish between debug info sections written to .dwo files and those written to the object file. --- llvm/lib/MC/ELFObjectWriter.cpp | 9 +++++++-- llvm/test/CodeGen/X86/dwo-stats.ll | 30 ++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/X86/dwo-stats.ll diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index 5f586fe19a5bb..68e7f1785fa23 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -71,9 +71,12 @@ STATISTIC(StrtabBytes, "Total size of SHT_STRTAB sections"); STATISTIC(SymtabBytes, "Total size of SHT_SYMTAB sections"); STATISTIC(RelocationBytes, "Total size of relocation sections"); STATISTIC(DynsymBytes, "Total size of SHT_DYNSYM sections"); -STATISTIC(DebugBytes, "Total size of debug info sections"); +STATISTIC( + DebugBytes, + "Total size of debug info sections (not including those written to .dwo)"); STATISTIC(UnwindBytes, "Total size of unwind sections"); STATISTIC(OtherBytes, "Total size of uncategorized sections"); +STATISTIC(DwoBytes, "Total size of sections written to .dwo file"); } // namespace stats @@ -969,7 +972,9 @@ void ELFWriter::writeSectionHeaders(const MCAssembler &Asm) { return Section->getFlags() & Flag; }; - if (Section->getName().starts_with(".debug")) { + if (Mode == DwoOnly) { + stats::DwoBytes += Size; + } else if (Section->getName().starts_with(".debug")) { stats::DebugBytes += Size; } else if (Section->getName().starts_with(".eh_frame")) { stats::UnwindBytes += Size; diff --git a/llvm/test/CodeGen/X86/dwo-stats.ll b/llvm/test/CodeGen/X86/dwo-stats.ll new file mode 100644 index 0000000000000..fccfd55029c8b --- /dev/null +++ b/llvm/test/CodeGen/X86/dwo-stats.ll @@ -0,0 +1,30 @@ +; 
REQUIRES: asserts +; RUN: llc %s -mtriple=x86_64-linux --split-dwarf-file=%t.dwo --split-dwarf-output=%t.dwo --filetype=obj -o /dev/null -stats 2>&1 | FileCheck %s --check-prefixes=SPLIT,CHECK +; RUN: llc %s -mtriple=x86_64-linux --filetype=obj -o /dev/null -stats 2>&1 | FileCheck %s --check-prefixes=NOTSPLIT,CHECK + +; NOTSPLIT-NOT: {{[0-9]+}} elf-object-writer - Total size of sections written to .dwo file +; CHECK-DAG: {{[0-9]+}} elf-object-writer - Total size of debug info sections +; SPLIT-DAG: {{[0-9]+}} elf-object-writer - Total size of sections written to .dwo file +; NOTSPLIT-NOT: {{[0-9]+}} elf-object-writer - Total size of sections written to .dwo file + +define void @banana() !dbg !8 { + ret void, !dbg !12 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5, !6} +!llvm.ident = !{!7} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.1", isOptimized: true, runtimeVersion: 0, splitDebugFilename: "test.dwo", emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: GNU) +!1 = !DIFile(filename: "/tmp/test.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{i32 7, !"PIC Level", i32 2} +!7 = !{!"clang version 11.0.1"} +!8 = distinct !DISubprogram(name: "banana", scope: !9, file: !9, line: 1, type: !10, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!9 = !DIFile(filename: "test.c", directory: "/tmp") +!10 = !DISubroutineType(types: !11) +!11 = !{null} +!12 = !DILocation(line: 1, column: 20, scope: !8) From 5a0075adbb623c8661862b9af1272b8f430d9e5c Mon Sep 17 00:00:00 2001 From: Diego Caballero Date: Fri, 7 Feb 2025 10:42:55 -0800 Subject: [PATCH 002/293] [mlir][Vector] Generate poison vectors in vector.shape_cast lowering (#125613) This is the first PR that introduces `ub.poison` 
vectors as part of a rewrite/conversion pattern in the Vector dialect. It replaces the `arith.constant dense<0>` vector initialization for `vector.insert_slice` ops with a poison vector. This PR depends on all the previous PRs that introduced support for poison in Vector operations such as `vector.shuffle`, `vector.extract`, `vector.insert`, including ODS, canonicalization and lowering support. This PR may improve end-to-end compilation time through LLVM, depending on the workloads. --- .../Transforms/LowerVectorShapeCast.cpp | 15 +++---- .../ConvertToSPIRV/vector-unroll.mlir | 8 ++-- ...tract-to-matrix-intrinsics-transforms.mlir | 12 +++--- ...-shape-cast-lowering-scalable-vectors.mlir | 41 +++++++++---------- ...vector-shape-cast-lowering-transforms.mlir | 40 +++++++++--------- 5 files changed, 55 insertions(+), 61 deletions(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorShapeCast.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorShapeCast.cpp index 239dc9aa1de6f..9c1e5fcee91de 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorShapeCast.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorShapeCast.cpp @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/UB//IR/UBOps.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" @@ -73,8 +73,7 @@ class ShapeCastOpNDDownCastRewritePattern SmallVector srcIdx(srcRank - 1, 0); SmallVector resIdx(resRank, 0); int64_t extractSize = sourceVectorType.getShape().back(); - Value result = rewriter.create( - loc, resultVectorType, rewriter.getZeroAttr(resultVectorType)); + Value result = rewriter.create(loc, resultVectorType); // Compute the indices of each 1-D vector element of the source extraction // and destination slice 
insertion and generate such instructions. @@ -129,8 +128,7 @@ class ShapeCastOpNDUpCastRewritePattern SmallVector srcIdx(srcRank, 0); SmallVector resIdx(resRank - 1, 0); int64_t extractSize = resultVectorType.getShape().back(); - Value result = rewriter.create( - loc, resultVectorType, rewriter.getZeroAttr(resultVectorType)); + Value result = rewriter.create(loc, resultVectorType); for (int64_t i = 0; i < numElts; ++i) { if (i != 0) { incIdx(srcIdx, sourceVectorType, /*step=*/extractSize); @@ -184,8 +182,7 @@ class ShapeCastOpRewritePattern : public OpRewritePattern { // within the source and result shape. SmallVector srcIdx(srcRank, 0); SmallVector resIdx(resRank, 0); - Value result = rewriter.create( - loc, resultVectorType, rewriter.getZeroAttr(resultVectorType)); + Value result = rewriter.create(loc, resultVectorType); for (int64_t i = 0; i < numElts; i++) { if (i != 0) { incIdx(srcIdx, sourceVectorType); @@ -291,9 +288,7 @@ class ScalableShapeCastOpRewritePattern auto extractionVectorType = VectorType::get( {minExtractionSize}, sourceVectorType.getElementType(), {true}); - Value result = rewriter.create( - loc, resultVectorType, rewriter.getZeroAttr(resultVectorType)); - + Value result = rewriter.create(loc, resultVectorType); SmallVector srcIdx(srcRank, 0); SmallVector resIdx(resRank, 0); diff --git a/mlir/test/Conversion/ConvertToSPIRV/vector-unroll.mlir b/mlir/test/Conversion/ConvertToSPIRV/vector-unroll.mlir index 043f9422d8790..d68ba44ee8840 100644 --- a/mlir/test/Conversion/ConvertToSPIRV/vector-unroll.mlir +++ b/mlir/test/Conversion/ConvertToSPIRV/vector-unroll.mlir @@ -83,17 +83,17 @@ func.func @vaddi_reduction(%arg0 : vector<8xi32>, %arg1 : vector<8xi32>) -> (i32 // CHECK-LABEL: @transpose // CHECK-SAME: (%[[ARG0:.+]]: vector<3xi32>, %[[ARG1:.+]]: vector<3xi32>) func.func @transpose(%arg0 : vector<2x3xi32>) -> (vector<3x2xi32>) { - // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<2xi32> + // CHECK: %[[UB:.*]] = ub.poison : vector<2xi32> // 
CHECK: %[[EXTRACT0:.*]] = vector.extract %[[ARG0]][0] : i32 from vector<3xi32> - // CHECK: %[[INSERT0:.*]]= vector.insert %[[EXTRACT0]], %[[CST]] [0] : i32 into vector<2xi32> + // CHECK: %[[INSERT0:.*]]= vector.insert %[[EXTRACT0]], %[[UB]] [0] : i32 into vector<2xi32> // CHECK: %[[EXTRACT1:.*]] = vector.extract %[[ARG1]][0] : i32 from vector<3xi32> // CHECK: %[[INSERT1:.*]] = vector.insert %[[EXTRACT1]], %[[INSERT0]][1] : i32 into vector<2xi32> // CHECK: %[[EXTRACT2:.*]] = vector.extract %[[ARG0]][1] : i32 from vector<3xi32> - // CHECK: %[[INSERT2:.*]] = vector.insert %[[EXTRACT2]], %[[CST]] [0] : i32 into vector<2xi32> + // CHECK: %[[INSERT2:.*]] = vector.insert %[[EXTRACT2]], %[[UB]] [0] : i32 into vector<2xi32> // CHECK: %[[EXTRACT3:.*]] = vector.extract %[[ARG1]][1] : i32 from vector<3xi32> // CHECK: %[[INSERT3:.*]] = vector.insert %[[EXTRACT3]], %[[INSERT2]] [1] : i32 into vector<2xi32> // CHECK: %[[EXTRACT4:.*]] = vector.extract %[[ARG0]][2] : i32 from vector<3xi32> - // CHECK: %[[INSERT4:.*]] = vector.insert %[[EXTRACT4]], %[[CST]] [0] : i32 into vector<2xi32> + // CHECK: %[[INSERT4:.*]] = vector.insert %[[EXTRACT4]], %[[UB]] [0] : i32 into vector<2xi32> // CHECK: %[[EXTRACT5:.*]] = vector.extract %[[ARG1]][2] : i32 from vector<3xi32> // CHECK: %[[INSERT5:.*]] = vector.insert %[[EXTRACT5]], %[[INSERT4]] [1] : i32 into vector<2xi32> // CHECK: return %[[INSERT1]], %[[INSERT3]], %[[INSERT5]] : vector<2xi32>, vector<2xi32>, vector<2xi32> diff --git a/mlir/test/Dialect/Vector/vector-contract-to-matrix-intrinsics-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-to-matrix-intrinsics-transforms.mlir index 4867a416e5d14..08ac2ac5bb7d5 100644 --- a/mlir/test/Dialect/Vector/vector-contract-to-matrix-intrinsics-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-contract-to-matrix-intrinsics-transforms.mlir @@ -14,15 +14,15 @@ // CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: vector<2x4xf32>, // CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: vector<4x3xf32>, // CHECK-SAME: 
%[[C:[a-zA-Z0-9]*]]: vector<2x3xf32> -// CHECK-DAG: %[[vcst:.*]] = arith.constant dense<0.000000e+00> : vector<8xf32> -// CHECK-DAG: %[[vcst_0:.*]] = arith.constant dense<0.000000e+00> : vector<12xf32> -// CHECK-DAG: %[[vcst_1:.*]] = arith.constant dense<0.000000e+00> : vector<2x3xf32> +// CHECK-DAG: %[[ub:.*]] = ub.poison : vector<8xf32> +// CHECK-DAG: %[[ub_0:.*]] = ub.poison : vector<12xf32> +// CHECK-DAG: %[[ub_1:.*]] = ub.poison : vector<2x3xf32> // CHECK: %[[a0:.*]] = vector.extract %[[A]][0] : vector<4xf32> from vector<2x4xf32> -// CHECK: %[[a1:.*]] = vector.insert_strided_slice %[[a0]], %[[vcst]] {offsets = [0], strides = [1]} : vector<4xf32> into vector<8xf32> +// CHECK: %[[a1:.*]] = vector.insert_strided_slice %[[a0]], %[[ub]] {offsets = [0], strides = [1]} : vector<4xf32> into vector<8xf32> // CHECK: %[[a2:.*]] = vector.extract %[[A]][1] : vector<4xf32> from vector<2x4xf32> // CHECK: %[[a3:.*]] = vector.insert_strided_slice %[[a2]], %[[a1]] {offsets = [4], strides = [1]} : vector<4xf32> into vector<8xf32> // CHECK: %[[b0:.*]] = vector.extract %[[B]][0] : vector<3xf32> from vector<4x3xf32> -// CHECK: %[[b1:.*]] = vector.insert_strided_slice %[[b0]], %[[vcst_0]] {offsets = [0], strides = [1]} : vector<3xf32> into vector<12xf32> +// CHECK: %[[b1:.*]] = vector.insert_strided_slice %[[b0]], %[[ub_0]] {offsets = [0], strides = [1]} : vector<3xf32> into vector<12xf32> // CHECK: %[[b2:.*]] = vector.extract %[[B]][1] : vector<3xf32> from vector<4x3xf32> // CHECK: %[[b3:.*]] = vector.insert_strided_slice %[[b2]], %[[b1]] {offsets = [3], strides = [1]} : vector<3xf32> into vector<12xf32> // CHECK: %[[b4:.*]] = vector.extract %[[B]][2] : vector<3xf32> from vector<4x3xf32> @@ -31,7 +31,7 @@ // CHECK: %[[b7:.*]] = vector.insert_strided_slice %[[b6]], %[[b5]] {offsets = [9], strides = [1]} : vector<3xf32> into vector<12xf32> // CHECK: %[[mm1:.*]] = vector.matrix_multiply %[[a3]], %[[b7]] {lhs_columns = 4 : i32, lhs_rows = 2 : i32, rhs_columns = 3 : i32} : 
(vector<8xf32>, vector<12xf32>) -> vector<6xf32> // CHECK: %[[mm2:.*]] = vector.extract_strided_slice %[[mm1]] {offsets = [0], sizes = [3], strides = [1]} : vector<6xf32> to vector<3xf32> -// CHECK: %[[mm3:.*]] = vector.insert %[[mm2]], %[[vcst_1]] [0] : vector<3xf32> into vector<2x3xf32> +// CHECK: %[[mm3:.*]] = vector.insert %[[mm2]], %[[ub_1]] [0] : vector<3xf32> into vector<2x3xf32> // CHECK: %[[mm4:.*]] = vector.extract_strided_slice %[[mm1]] {offsets = [3], sizes = [3], strides = [1]} : vector<6xf32> to vector<3xf32> // CHECK: %[[mm5:.*]] = vector.insert %[[mm4]], %[[mm3]] [1] : vector<3xf32> into vector<2x3xf32> // CHECK: %[[mm6:.*]] = arith.addf %[[C]], %[[mm5]] : vector<2x3xf32> diff --git a/mlir/test/Dialect/Vector/vector-shape-cast-lowering-scalable-vectors.mlir b/mlir/test/Dialect/Vector/vector-shape-cast-lowering-scalable-vectors.mlir index fde6ce9102446..f4becad3c79c1 100644 --- a/mlir/test/Dialect/Vector/vector-shape-cast-lowering-scalable-vectors.mlir +++ b/mlir/test/Dialect/Vector/vector-shape-cast-lowering-scalable-vectors.mlir @@ -7,9 +7,9 @@ // CHECK-SAME: %[[arg0:.*]]: vector<2x1x[4]xi32> func.func @i32_3d_to_1d_last_dim_scalable(%arg0: vector<2x1x[4]xi32>) -> vector<[8]xi32> { - // CHECK-NEXT: %[[cst:.*]] = arith.constant dense<0> : vector<[8]xi32> + // CHECK-NEXT: %[[ub:.*]] = ub.poison : vector<[8]xi32> // CHECK-NEXT: %[[subvec0:.*]] = vector.extract %[[arg0]][0, 0] : vector<[4]xi32> from vector<2x1x[4]xi32> - // CHECK-NEXT: %[[res0:.*]] = vector.scalable.insert %[[subvec0]], %[[cst]][0] : vector<[4]xi32> into vector<[8]xi32> + // CHECK-NEXT: %[[res0:.*]] = vector.scalable.insert %[[subvec0]], %[[ub]][0] : vector<[4]xi32> into vector<[8]xi32> // CHECK-NEXT: %[[subvec1:.*]] = vector.extract %[[arg0]][1, 0] : vector<[4]xi32> from vector<2x1x[4]xi32> // CHECK-NEXT: %[[res1:.*]] = vector.scalable.insert %[[subvec1]], %[[res0]][4] : vector<[4]xi32> into vector<[8]xi32> %flat = vector.shape_cast %arg0 : vector<2x1x[4]xi32> to vector<[8]xi32> @@ 
-22,9 +22,9 @@ func.func @i32_3d_to_1d_last_dim_scalable(%arg0: vector<2x1x[4]xi32>) -> vector< // CHECK-LABEL: i32_1d_to_3d_last_dim_scalable // CHECK-SAME: %[[arg0:.*]]: vector<[8]xi32> func.func @i32_1d_to_3d_last_dim_scalable(%arg0: vector<[8]xi32>) -> vector<2x1x[4]xi32> { - // CHECK-NEXT: %[[cst:.*]] = arith.constant dense<0> : vector<2x1x[4]xi32> + // CHECK-NEXT: %[[ub:.*]] = ub.poison : vector<2x1x[4]xi32> // CHECK-NEXT: %[[subvec0:.*]] = vector.scalable.extract %[[arg0]][0] : vector<[4]xi32> from vector<[8]xi32> - // CHECK-NEXT: %[[res0:.*]] = vector.insert %[[subvec0]], %[[cst]] [0, 0] : vector<[4]xi32> into vector<2x1x[4]xi32> + // CHECK-NEXT: %[[res0:.*]] = vector.insert %[[subvec0]], %[[ub]] [0, 0] : vector<[4]xi32> into vector<2x1x[4]xi32> // CHECK-NEXT: %[[subvec1:.*]] = vector.scalable.extract %[[arg0]][4] : vector<[4]xi32> from vector<[8]xi32> // CHECK-NEXT: %[[res1:.*]] = vector.insert %[[subvec1]], %[[res0]] [1, 0] : vector<[4]xi32> into vector<2x1x[4]xi32> %unflat = vector.shape_cast %arg0 : vector<[8]xi32> to vector<2x1x[4]xi32> @@ -37,9 +37,9 @@ func.func @i32_1d_to_3d_last_dim_scalable(%arg0: vector<[8]xi32>) -> vector<2x1x // CHECK-LABEL: i8_2d_to_1d_last_dim_scalable // CHECK-SAME: %[[arg0:.*]]: vector<4x[8]xi8> func.func @i8_2d_to_1d_last_dim_scalable(%arg0: vector<4x[8]xi8>) -> vector<[32]xi8> { - // CHECK-NEXT: %[[cst:.*]] = arith.constant dense<0> : vector<[32]xi8> + // CHECK-NEXT: %[[ub:.*]] = ub.poison : vector<[32]xi8> // CHECK-NEXT: %[[subvec0:.*]] = vector.extract %[[arg0]][0] : vector<[8]xi8> from vector<4x[8]xi8> - // CHECK-NEXT: %[[res0:.*]] = vector.scalable.insert %[[subvec0]], %[[cst]][0] : vector<[8]xi8> into vector<[32]xi8> + // CHECK-NEXT: %[[res0:.*]] = vector.scalable.insert %[[subvec0]], %[[ub]][0] : vector<[8]xi8> into vector<[32]xi8> // CHECK-NEXT: %[[subvec1:.*]] = vector.extract %[[arg0]][1] : vector<[8]xi8> from vector<4x[8]xi8> // CHECK-NEXT: %[[res1:.*]] = vector.scalable.insert %[[subvec1]], %[[res0]][8] : 
vector<[8]xi8> into vector<[32]xi8> // CHECK-NEXT: %[[subvec2:.*]] = vector.extract %[[arg0]][2] : vector<[8]xi8> from vector<4x[8]xi8> @@ -56,9 +56,9 @@ func.func @i8_2d_to_1d_last_dim_scalable(%arg0: vector<4x[8]xi8>) -> vector<[32] // CHECK-LABEL: i8_1d_to_2d_last_dim_scalable // CHECK-SAME: %[[arg0:.*]]: vector<[32]xi8> func.func @i8_1d_to_2d_last_dim_scalable(%arg0: vector<[32]xi8>) -> vector<4x[8]xi8> { - // CHECK-NEXT: %[[cst:.*]] = arith.constant dense<0> : vector<4x[8]xi8> + // CHECK-NEXT: %[[ub:.*]] = ub.poison : vector<4x[8]xi8> // CHECK-NEXT: %[[subvec0:.*]] = vector.scalable.extract %[[arg0]][0] : vector<[8]xi8> from vector<[32]xi8> - // CHECK-NEXT: %[[res0:.*]] = vector.insert %[[subvec0]], %[[cst]] [0] : vector<[8]xi8> into vector<4x[8]xi8> + // CHECK-NEXT: %[[res0:.*]] = vector.insert %[[subvec0]], %[[ub]] [0] : vector<[8]xi8> into vector<4x[8]xi8> // CHECK-NEXT: %[[subvec1:.*]] = vector.scalable.extract %[[arg0]][8] : vector<[8]xi8> from vector<[32]xi8> // CHECK-NEXT: %[[res1:.*]] = vector.insert %[[subvec1]], %[[res0]] [1] : vector<[8]xi8> into vector<4x[8]xi8> // CHECK-NEXT: %[[subvec2:.*]] = vector.scalable.extract %[[arg0]][16] : vector<[8]xi8> from vector<[32]xi8> @@ -75,9 +75,9 @@ func.func @i8_1d_to_2d_last_dim_scalable(%arg0: vector<[32]xi8>) -> vector<4x[8] // CHECK-LABEL: f32_permute_leading_non_scalable_dims // CHECK-SAME: %[[arg0:.*]]: vector<2x3x[4]xf32> func.func @f32_permute_leading_non_scalable_dims(%arg0: vector<2x3x[4]xf32>) -> vector<3x2x[4]xf32> { - // CHECK-NEXT: %[[cst:.*]] = arith.constant dense<0.000000e+00> : vector<3x2x[4]xf32> + // CHECK-NEXT: %[[ub:.*]] = ub.poison : vector<3x2x[4]xf32> // CHECK-NEXT: %[[subvec0:.*]] = vector.extract %[[arg0]][0, 0] : vector<[4]xf32> from vector<2x3x[4]xf32> - // CHECK-NEXT: %[[res0:.*]] = vector.insert %[[subvec0]], %[[cst]] [0, 0] : vector<[4]xf32> into vector<3x2x[4]xf32> + // CHECK-NEXT: %[[res0:.*]] = vector.insert %[[subvec0]], %[[ub]] [0, 0] : vector<[4]xf32> into 
vector<3x2x[4]xf32> // CHECK-NEXT: %[[subvec1:.*]] = vector.extract %[[arg0]][0, 1] : vector<[4]xf32> from vector<2x3x[4]xf32> // CHECK-NEXT: %[[res1:.*]] = vector.insert %[[subvec1]], %[[res0]] [0, 1] : vector<[4]xf32> into vector<3x2x[4]xf32> // CHECK-NEXT: %[[subvec2:.*]] = vector.extract %[[arg0]][0, 2] : vector<[4]xf32> from vector<2x3x[4]xf32> @@ -99,9 +99,9 @@ func.func @f32_permute_leading_non_scalable_dims(%arg0: vector<2x3x[4]xf32>) -> // CHECK-SAME: %[[arg0:.*]]: vector<2x2x[2]xf64> func.func @f64_flatten_leading_non_scalable_dims(%arg0: vector<2x2x[2]xf64>) -> vector<4x[2]xf64> { - // CHECK-NEXT: %[[cst:.*]] = arith.constant dense<0.000000e+00> : vector<4x[2]xf64> + // CHECK-NEXT: %[[ub:.*]] = ub.poison : vector<4x[2]xf64> // CHECK-NEXT: %[[subvec0:.*]] = vector.extract %[[arg0]][0, 0] : vector<[2]xf64> from vector<2x2x[2]xf64> - // CHECK-NEXT: %[[res0:.*]] = vector.insert %[[subvec0]], %[[cst]] [0] : vector<[2]xf64> into vector<4x[2]xf64> + // CHECK-NEXT: %[[res0:.*]] = vector.insert %[[subvec0]], %[[ub]] [0] : vector<[2]xf64> into vector<4x[2]xf64> // CHECK-NEXT: %[[subvec1:.*]] = vector.extract %[[arg0]][0, 1] : vector<[2]xf64> from vector<2x2x[2]xf64> // CHECK-NEXT: %[[res1:.*]] = vector.insert %[[subvec1]], %[[res0]] [1] : vector<[2]xf64> into vector<4x[2]xf64> // CHECK-NEXT: %[[subvec2:.*]] = vector.extract %[[arg0]][1, 0] : vector<[2]xf64> from vector<2x2x[2]xf64> @@ -109,7 +109,7 @@ func.func @f64_flatten_leading_non_scalable_dims(%arg0: vector<2x2x[2]xf64>) -> // CHECK-NEXT: %[[subvec3:.*]] = vector.extract %[[arg0]][1, 1] : vector<[2]xf64> from vector<2x2x[2]xf64> // CHECK-NEXT: %[[res3:.*]] = vector.insert %[[subvec3]], %[[res2]] [3] : vector<[2]xf64> into vector<4x[2]xf64> %res = vector.shape_cast %arg0: vector<2x2x[2]xf64> to vector<4x[2]xf64> - // CHECK-NEXT: return %7 : vector<4x[2]xf64> + // CHECK-NEXT: return %[[res3]] : vector<4x[2]xf64> return %res : vector<4x[2]xf64> } @@ -119,10 +119,10 @@ func.func
@f64_flatten_leading_non_scalable_dims(%arg0: vector<2x2x[2]xf64>) -> // CHECK-SAME: %[[arg0:.*]]: vector<3x[4]xf32> func.func @f32_reduce_trailing_scalable_dim(%arg0: vector<3x[4]xf32>) -> vector<6x[2]xf32> { - // CHECK-NEXT: %[[cst:.*]] = arith.constant dense<0.000000e+00> : vector<6x[2]xf32> + // CHECK-NEXT: %[[ub:.*]] = ub.poison : vector<6x[2]xf32> // CHECK-NEXT: %[[srcvec0:.*]] = vector.extract %[[arg0]][0] : vector<[4]xf32> from vector<3x[4]xf32> // CHECK-NEXT: %[[subvec0:.*]] = vector.scalable.extract %[[srcvec0]][0] : vector<[2]xf32> from vector<[4]xf32> - // CHECK-NEXT: %[[res0:.*]] = vector.insert %[[subvec0]], %[[cst]] [0] : vector<[2]xf32> into vector<6x[2]xf32> + // CHECK-NEXT: %[[res0:.*]] = vector.insert %[[subvec0]], %[[ub]] [0] : vector<[2]xf32> into vector<6x[2]xf32> // CHECK-NEXT: %[[subvec1:.*]] = vector.scalable.extract %[[srcvec0]][2] : vector<[2]xf32> from vector<[4]xf32> // CHECK-NEXT: %[[res1:.*]] = vector.insert %[[subvec1]], %[[res0]] [1] : vector<[2]xf32> into vector<6x[2]xf32> // CHECK-NEXT: %[[srcvec1:.*]] = vector.extract %[[arg0]][1] : vector<[4]xf32> from vector<3x[4]xf32> @@ -146,16 +146,15 @@ func.func @f32_reduce_trailing_scalable_dim(%arg0: vector<3x[4]xf32>) -> vector< // CHECK-SAME: %[[arg0:.*]]: vector<4x[2]xf32> func.func @f32_increase_trailing_scalable_dim(%arg0: vector<4x[2]xf32>) -> vector<2x[4]xf32> { - // CHECK-NEXT: %[[cst:.*]] = arith.constant dense<0.000000e+00> : vector<2x[4]xf32> + // CHECK-DAG: %[[ub0:.*]] = ub.poison : vector<2x[4]xf32> + // CHECK-DAG: %[[ub1:.*]] = ub.poison : vector<[4]xf32> // CHECK-NEXT: %[[subvec0:.*]] = vector.extract %[[arg0]][0] : vector<[2]xf32> from vector<4x[2]xf32> - // CHECK-NEXT: %[[resvec0:.*]] = vector.extract %[[cst]][0] : vector<[4]xf32> from vector<2x[4]xf32> - // CHECK-NEXT: %[[resvec1:.*]] = vector.scalable.insert %[[subvec0]], %[[resvec0]][0] : vector<[2]xf32> into vector<[4]xf32> + // CHECK-NEXT: %[[resvec1:.*]] = vector.scalable.insert %[[subvec0]], %[[ub1]][0] : 
vector<[2]xf32> into vector<[4]xf32> // CHECK-NEXT: %[[subvec1:.*]] = vector.extract %[[arg0]][1] : vector<[2]xf32> from vector<4x[2]xf32> // CHECK-NEXT: %[[resvec2:.*]] = vector.scalable.insert %[[subvec1]], %[[resvec1]][2] : vector<[2]xf32> into vector<[4]xf32> - // CHECK-NEXT: %[[res0:.*]] = vector.insert %[[resvec2]], %[[cst]] [0] : vector<[4]xf32> into vector<2x[4]xf32> + // CHECK-NEXT: %[[res0:.*]] = vector.insert %[[resvec2]], %[[ub0]] [0] : vector<[4]xf32> into vector<2x[4]xf32> // CHECK-NEXT: %[[subvec3:.*]] = vector.extract %[[arg0]][2] : vector<[2]xf32> from vector<4x[2]xf32> - // CHECK-NEXT: %[[resvec3:.*]] = vector.extract %[[cst]][1] : vector<[4]xf32> from vector<2x[4]xf32> - // CHECK-NEXT: %[[resvec4:.*]] = vector.scalable.insert %[[subvec3]], %[[resvec3]][0] : vector<[2]xf32> into vector<[4]xf32> + // CHECK-NEXT: %[[resvec4:.*]] = vector.scalable.insert %[[subvec3]], %[[ub1]][0] : vector<[2]xf32> into vector<[4]xf32> // CHECK-NEXT: %[[subvec4:.*]] = vector.extract %[[arg0]][3] : vector<[2]xf32> from vector<4x[2]xf32> // CHECK-NEXT: %[[resvec5:.*]] = vector.scalable.insert %[[subvec4]], %[[resvec4]][2] : vector<[2]xf32> into vector<[4]xf32> // CHECK-NEXT: %[[res1:.*]] = vector.insert %[[resvec5]], %[[res0]] [1] : vector<[4]xf32> into vector<2x[4]xf32> diff --git a/mlir/test/Dialect/Vector/vector-shape-cast-lowering-transforms.mlir b/mlir/test/Dialect/Vector/vector-shape-cast-lowering-transforms.mlir index b4c52d5533116..ab30acf68b30b 100644 --- a/mlir/test/Dialect/Vector/vector-shape-cast-lowering-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-shape-cast-lowering-transforms.mlir @@ -22,11 +22,11 @@ func.func @cancel_shape_cast(%arg0: vector<16xf32>) -> vector<16xf32> { // llvm.matrix operations // CHECK-LABEL: func @shape_casts func.func @shape_casts(%a: vector<2x2xf32>) -> (vector<4xf32>, vector<2x2xf32>) { - // CHECK-DAG: %[[cst22:.*]] = arith.constant dense<0.000000e+00> : vector<2x2xf32> - // CHECK-DAG: %[[cst:.*]] = arith.constant 
dense<0.000000e+00> : vector<4xf32> + // CHECK-DAG: %[[ub22:.*]] = ub.poison : vector<2x2xf32> + // CHECK-DAG: %[[ub:.*]] = ub.poison : vector<4xf32> // CHECK: %[[ex0:.*]] = vector.extract %{{.*}}[0] : vector<2xf32> from vector<2x2xf32> // - // CHECK: %[[in0:.*]] = vector.insert_strided_slice %[[ex0]], %[[cst]] + // CHECK: %[[in0:.*]] = vector.insert_strided_slice %[[ex0]], %[[ub]] // CHECK-SAME: {offsets = [0], strides = [1]} : vector<2xf32> into vector<4xf32> // // CHECK: %[[ex1:.*]] = vector.extract %{{.*}}[1] : vector<2xf32> from vector<2x2xf32> @@ -42,7 +42,7 @@ func.func @shape_casts(%a: vector<2x2xf32>) -> (vector<4xf32>, vector<2x2xf32>) // CHECK-SAME: {offsets = [0], sizes = [2], strides = [1]} : // CHECK-SAME: vector<4xf32> to vector<2xf32> // - // CHECK: %[[res0:.*]] = vector.insert %[[ss0]], %[[cst22]] [0] : + // CHECK: %[[res0:.*]] = vector.insert %[[ss0]], %[[ub22]] [0] : // CHECK-SAME: vector<2xf32> into vector<2x2xf32> // // CHECK: %[[s2:.*]] = vector.extract_strided_slice %[[add]] @@ -59,9 +59,9 @@ func.func @shape_casts(%a: vector<2x2xf32>) -> (vector<4xf32>, vector<2x2xf32>) // CHECK-LABEL: func @shape_cast_2d2d // CHECK-SAME: %[[A:.*]]: vector<3x2xf32> -// CHECK: %[[C:.*]] = arith.constant dense<0.000000e+00> : vector<2x3xf32> +// CHECK: %[[UB:.*]] = ub.poison : vector<2x3xf32> // CHECK: %[[T0:.*]] = vector.extract %[[A]][0, 0] : f32 from vector<3x2xf32> -// CHECK: %[[T1:.*]] = vector.insert %[[T0]], %[[C]] [0, 0] : f32 into vector<2x3xf32> +// CHECK: %[[T1:.*]] = vector.insert %[[T0]], %[[UB]] [0, 0] : f32 into vector<2x3xf32> // CHECK: %[[T2:.*]] = vector.extract %[[A]][0, 1] : f32 from vector<3x2xf32> // CHECK: %[[T3:.*]] = vector.insert %[[T2]], %[[T1]] [0, 1] : f32 into vector<2x3xf32> // CHECK: %[[T4:.*]] = vector.extract %[[A]][1, 0] : f32 from vector<3x2xf32> @@ -81,9 +81,9 @@ func.func @shape_cast_2d2d(%arg0 : vector<3x2xf32>) -> vector<2x3xf32> { // CHECK-LABEL: func @shape_cast_3d1d // CHECK-SAME: %[[A:.*]]: vector<1x3x2xf32> -// 
CHECK: %[[C:.*]] = arith.constant dense<0.000000e+00> : vector<6xf32> +// CHECK: %[[UB:.*]] = ub.poison : vector<6xf32> // CHECK: %[[T0:.*]] = vector.extract %[[A]][0, 0] : vector<2xf32> from vector<1x3x2xf32> -// CHECK: %[[T1:.*]] = vector.insert_strided_slice %[[T0]], %[[C]] +// CHECK: %[[T1:.*]] = vector.insert_strided_slice %[[T0]], %[[UB]] // CHECK-SAME: {offsets = [0], strides = [1]} : vector<2xf32> into vector<6xf32> // CHECK: %[[T2:.*]] = vector.extract %[[A]][0, 1] : vector<2xf32> from vector<1x3x2xf32> // CHECK: %[[T3:.*]] = vector.insert_strided_slice %[[T2]], %[[T1]] @@ -100,10 +100,10 @@ func.func @shape_cast_3d1d(%arg0 : vector<1x3x2xf32>) -> vector<6xf32> { // CHECK-LABEL: func @shape_cast_1d3d // CHECK-SAME: %[[A:.*]]: vector<6xf32> -// CHECK: %[[C:.*]] = arith.constant dense<0.000000e+00> : vector<2x1x3xf32> +// CHECK: %[[UB:.*]] = ub.poison : vector<2x1x3xf32> // CHECK: %[[T0:.*]] = vector.extract_strided_slice %[[A]] // CHECK-SAME: {offsets = [0], sizes = [3], strides = [1]} : vector<6xf32> to vector<3xf32> -// CHECK: %[[T1:.*]] = vector.insert %[[T0]], %[[C]] [0, 0] : vector<3xf32> into vector<2x1x3xf32> +// CHECK: %[[T1:.*]] = vector.insert %[[T0]], %[[UB]] [0, 0] : vector<3xf32> into vector<2x1x3xf32> // CHECK: %[[T2:.*]] = vector.extract_strided_slice %[[A]] // CHECK: {offsets = [3], sizes = [3], strides = [1]} : vector<6xf32> to vector<3xf32> // CHECK: %[[T3:.*]] = vector.insert %[[T2]], %[[T1]] [1, 0] : vector<3xf32> into vector<2x1x3xf32> @@ -115,11 +115,11 @@ func.func @shape_cast_1d3d(%arg0 : vector<6xf32>) -> vector<2x1x3xf32> { } // CHECK-LABEL: func.func @shape_cast_0d1d( -// CHECK-SAME: %[[VAL_0:.*]]: vector) -> vector<1xf32> { -// CHECK: %[[VAL_1:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32> -// CHECK: %[[VAL_2:.*]] = vector.extractelement %[[VAL_0]][] : vector -// CHECK: %[[VAL_3:.*]] = vector.insert %[[VAL_2]], %[[VAL_1]] [0] : f32 into vector<1xf32> -// CHECK: return %[[VAL_3]] : vector<1xf32> +// CHECK-SAME: 
%[[ARG0:.*]]: vector) -> vector<1xf32> { +// CHECK: %[[UB:.*]] = ub.poison : vector<1xf32> +// CHECK: %[[EXTRACT0:.*]] = vector.extractelement %[[ARG0]][] : vector +// CHECK: %[[RES:.*]] = vector.insert %[[EXTRACT0]], %[[UB]] [0] : f32 into vector<1xf32> +// CHECK: return %[[RES]] : vector<1xf32> // CHECK: } func.func @shape_cast_0d1d(%arg0 : vector) -> vector<1xf32> { @@ -128,11 +128,11 @@ func.func @shape_cast_0d1d(%arg0 : vector) -> vector<1xf32> { } // CHECK-LABEL: func.func @shape_cast_1d0d( -// CHECK-SAME: %[[VAL_0:.*]]: vector<1xf32>) -> vector { -// CHECK: %[[VAL_1:.*]] = arith.constant dense<0.000000e+00> : vector -// CHECK: %[[VAL_2:.*]] = vector.extract %[[VAL_0]][0] : f32 from vector<1xf32> -// CHECK: %[[VAL_3:.*]] = vector.insertelement %[[VAL_2]], %[[VAL_1]][] : vector -// CHECK: return %[[VAL_3]] : vector +// CHECK-SAME: %[[ARG0:.*]]: vector<1xf32>) -> vector { +// CHECK: %[[UB:.*]] = ub.poison : vector +// CHECK: %[[EXTRACT0:.*]] = vector.extract %[[ARG0]][0] : f32 from vector<1xf32> +// CHECK: %[[RES:.*]] = vector.insertelement %[[EXTRACT0]], %[[UB]][] : vector +// CHECK: return %[[RES]] : vector // CHECK: } func.func @shape_cast_1d0d(%arg0 : vector<1xf32>) -> vector { From e566313a1fac1b290c98454cc52b485ae4f644c5 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 7 Feb 2025 10:43:56 -0800 Subject: [PATCH 003/293] [libc][signal] clean up usage of sighandler_t (#125745) `man 3 signal`'s declaration has a face _only a mother could love_. sighandler_t and __sighandler_t are not defined in the C standard, or POSIX. They are helpful typedefs provided by glibc and the Linux kernel UAPI headers respectively since working with function pointers' syntax can be painful. But we should not rely on them; in C++ we have `auto*` and `using` statements. Remove the proxy header, and only include a typedef for sighandler_t when targeting Linux, for compatibility with glibc. 
Fixes: #125598 --- libc/hdr/types/CMakeLists.txt | 9 ------- libc/hdr/types/sighandler_t.h | 24 ------------------- libc/include/CMakeLists.txt | 7 +++--- .../llvm-libc-macros/gpu/signal-macros.h | 6 ++--- .../llvm-libc-macros/linux/signal-macros.h | 6 ++--- libc/include/llvm-libc-types/CMakeLists.txt | 2 +- .../{__sighandler_t.h => sighandler_t.h} | 13 ++++++---- .../llvm-libc-types/struct_sigaction.h | 2 -- libc/include/signal.yaml | 16 +++++++++---- libc/src/signal/linux/CMakeLists.txt | 1 - libc/src/signal/linux/signal.cpp | 7 ++++-- libc/src/signal/signal.h | 3 +-- libc/test/UnitTest/FPExceptMatcher.cpp | 2 +- libc/test/src/signal/CMakeLists.txt | 1 - libc/test/src/signal/signal_test.cpp | 4 +--- 15 files changed, 38 insertions(+), 65 deletions(-) delete mode 100644 libc/hdr/types/sighandler_t.h rename libc/include/llvm-libc-types/{__sighandler_t.h => sighandler_t.h} (52%) diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index dfc90009ef54a..84a2647ba664d 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -250,15 +250,6 @@ add_proxy_header_library( libc.include.locale ) -add_proxy_header_library( - sighandler_t - HDRS - sighandler_t.h - FULL_BUILD_DEPENDS - libc.include.llvm-libc-types.__sighandler_t - libc.include.signal -) - add_proxy_header_library( stack_t HDRS diff --git a/libc/hdr/types/sighandler_t.h b/libc/hdr/types/sighandler_t.h deleted file mode 100644 index bc40dd8b4c8f4..0000000000000 --- a/libc/hdr/types/sighandler_t.h +++ /dev/null @@ -1,24 +0,0 @@ -//===-- Definition of macros from __sighandler_t.h ------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_HDR_TYPES_SIGHANDLER_T_H -#define LLVM_LIBC_HDR_TYPES_SIGHANDLER_T_H - -#ifdef LIBC_FULL_BUILD - -#include "include/llvm-libc-types/__sighandler_t.h" - -using sighandler_t = __sighandler_t; - -#else // overlay mode - -#include - -#endif // LLVM_LIBC_FULL_BUILD - -#endif // LLVM_LIBC_HDR_TYPES_SIGHANDLER_T_H diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 63745542662d5..867bd1e5ee20f 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -284,13 +284,14 @@ add_header_macro( signal.h DEPENDS .llvm-libc-macros.signal_macros + .llvm-libc-types.pid_t .llvm-libc-types.sig_atomic_t + .llvm-libc-types.sighandler_t + .llvm-libc-types.siginfo_t .llvm-libc-types.sigset_t + .llvm-libc-types.stack_t .llvm-libc-types.struct_sigaction .llvm-libc-types.union_sigval - .llvm-libc-types.siginfo_t - .llvm-libc-types.stack_t - .llvm-libc-types.pid_t ) add_header_macro( diff --git a/libc/include/llvm-libc-macros/gpu/signal-macros.h b/libc/include/llvm-libc-macros/gpu/signal-macros.h index 2d8159240de8b..f0d49ea34fe0e 100644 --- a/libc/include/llvm-libc-macros/gpu/signal-macros.h +++ b/libc/include/llvm-libc-macros/gpu/signal-macros.h @@ -16,9 +16,9 @@ #define SIGSEGV 11 #define SIGTERM 15 -#define SIG_DFL ((__sighandler_t)(0)) -#define SIG_IGN ((__sighandler_t)(1)) -#define SIG_ERR ((__sighandler_t)(-1)) +#define SIG_DFL ((void (*)(int))(0)) +#define SIG_IGN ((void (*)(int))(1)) +#define SIG_ERR ((void (*)(int))(-1)) // Max signal number #define NSIG 64 diff --git a/libc/include/llvm-libc-macros/linux/signal-macros.h b/libc/include/llvm-libc-macros/linux/signal-macros.h index 0b7317ebc9b80..d220241a38206 100644 --- a/libc/include/llvm-libc-macros/linux/signal-macros.h +++ b/libc/include/llvm-libc-macros/linux/signal-macros.h @@ -86,9 +86,9 @@ #error "Signal stack sizes 
not defined for your platform." #endif -#define SIG_DFL ((__sighandler_t)0) -#define SIG_IGN ((__sighandler_t)1) -#define SIG_ERR ((__sighandler_t)-1) +#define SIG_DFL ((void (*)(int))0) +#define SIG_IGN ((void (*)(int))1) +#define SIG_ERR ((void (*)(int))(-1)) // SIGCHLD si_codes #define CLD_EXITED 1 // child has exited diff --git a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt index 9e8d2f818d4ed..7ed69ab1af6d9 100644 --- a/libc/include/llvm-libc-types/CMakeLists.txt +++ b/libc/include/llvm-libc-types/CMakeLists.txt @@ -15,7 +15,6 @@ add_header(__pthread_start_t HDR __pthread_start_t.h) add_header(__pthread_tss_dtor_t HDR __pthread_tss_dtor_t.h) add_header(__qsortcompare_t HDR __qsortcompare_t.h) add_header(__qsortrcompare_t HDR __qsortrcompare_t.h) -add_header(__sighandler_t HDR __sighandler_t.h) add_header(__thread_type HDR __thread_type.h) add_header(blkcnt_t HDR blkcnt_t.h) add_header(blksize_t HDR blksize_t.h) @@ -66,6 +65,7 @@ if(LIBC_TYPES_TIME_T_IS_32_BIT) else() add_header(time_t HDR time_t_64.h DEST_HDR time_t.h) endif() +add_header(sighandler_t HDR sighandler_t.h) add_header(stack_t HDR stack_t.h DEPENDS .size_t) add_header(suseconds_t HDR suseconds_t.h) add_header(struct_dirent HDR struct_dirent.h DEPENDS .ino_t .off_t) diff --git a/libc/include/llvm-libc-types/__sighandler_t.h b/libc/include/llvm-libc-types/sighandler_t.h similarity index 52% rename from libc/include/llvm-libc-types/__sighandler_t.h rename to libc/include/llvm-libc-types/sighandler_t.h index 9c1ac997fc4ee..f39ab04685200 100644 --- a/libc/include/llvm-libc-types/__sighandler_t.h +++ b/libc/include/llvm-libc-types/sighandler_t.h @@ -1,4 +1,4 @@ -//===-- Definition of struct __sighandler_t -------------------------------===// +//===-- Definition of sighandler_t ----------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. @@ -6,9 +6,12 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_TYPES___SIGHANDLER_T_H -#define LLVM_LIBC_TYPES___SIGHANDLER_T_H +#ifndef LLVM_LIBC_TYPES_SIGHANDLER_T_H +#define LLVM_LIBC_TYPES_SIGHANDLER_T_H -typedef void (*__sighandler_t)(int); +#ifdef __linux__ +// For compatibility with glibc. +typedef void (*sighandler_t)(int); +#endif -#endif // LLVM_LIBC_TYPES___SIGHANDLER_T_H +#endif // LLVM_LIBC_TYPES_SIGHANDLER_T_H diff --git a/libc/include/llvm-libc-types/struct_sigaction.h b/libc/include/llvm-libc-types/struct_sigaction.h index b4d0c965a4c63..907418b5e0f9a 100644 --- a/libc/include/llvm-libc-types/struct_sigaction.h +++ b/libc/include/llvm-libc-types/struct_sigaction.h @@ -25,6 +25,4 @@ struct sigaction { #endif }; -typedef void (*__sighandler_t)(int); - #endif // LLVM_LIBC_TYPES_STRUCT_SIGACTION_H diff --git a/libc/include/signal.yaml b/libc/include/signal.yaml index 576e77576ac74..6fdd8c97ccbe2 100644 --- a/libc/include/signal.yaml +++ b/libc/include/signal.yaml @@ -3,12 +3,13 @@ header_template: signal.h.def macros: [] types: - type_name: pid_t - - type_name: stack_t + - type_name: sig_atomic_t + - type_name: sighandler_t - type_name: siginfo_t - - type_name: struct_sigaction - type_name: sigset_t + - type_name: stack_t + - type_name: struct_sigaction - type_name: union_sigval - - type_name: sig_atomic_t enums: [] objects: [] functions: @@ -69,10 +70,15 @@ functions: - name: signal standards: - stdc - return_type: __sighandler_t + # May the Geneva Convention have mercy on my soul... Why this insanity? + # Well: signal returns a function pointer to a function with no return + # value and which accepts an int. The parameter list appears on the far + # right of the declaration. i.e. 
+ # void (*signal(int, void (*)(int)))(int); + return_type: void (* arguments: - type: int - - type: __sighandler_t + - type: void (*)(int)))(int - name: sigprocmask standards: - POSIX diff --git a/libc/src/signal/linux/CMakeLists.txt b/libc/src/signal/linux/CMakeLists.txt index f7457d31cf4f8..c0dd61e473881 100644 --- a/libc/src/signal/linux/CMakeLists.txt +++ b/libc/src/signal/linux/CMakeLists.txt @@ -127,7 +127,6 @@ add_entrypoint_object( DEPENDS .sigaction libc.hdr.signal_macros - libc.hdr.types.sighandler_t ) add_entrypoint_object( diff --git a/libc/src/signal/linux/signal.cpp b/libc/src/signal/linux/signal.cpp index 1da0ef8c97a20..7c8ea16c6cd2e 100644 --- a/libc/src/signal/linux/signal.cpp +++ b/libc/src/signal/linux/signal.cpp @@ -8,14 +8,17 @@ #include "src/signal/signal.h" #include "hdr/signal_macros.h" -#include "hdr/types/sighandler_t.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/signal/sigaction.h" namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(sighandler_t, signal, (int signum, sighandler_t handler)) { +// Our LLVM_LIBC_FUNCTION macro doesn't handle function pointer return types. 
+using signal_handler = void (*)(int); + +LLVM_LIBC_FUNCTION(signal_handler, signal, + (int signum, signal_handler handler)) { struct sigaction action, old; action.sa_handler = handler; action.sa_flags = SA_RESTART; diff --git a/libc/src/signal/signal.h b/libc/src/signal/signal.h index 06e77e11bf0bd..e1f31a8e126c5 100644 --- a/libc/src/signal/signal.h +++ b/libc/src/signal/signal.h @@ -9,12 +9,11 @@ #ifndef LLVM_LIBC_SRC_SIGNAL_SIGNAL_H #define LLVM_LIBC_SRC_SIGNAL_SIGNAL_H -#include "hdr/types/sighandler_t.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { -sighandler_t signal(int signum, sighandler_t handler); +void (*signal(int signum, void (*handler)(int)))(int); } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/UnitTest/FPExceptMatcher.cpp b/libc/test/UnitTest/FPExceptMatcher.cpp index 119a06985b8f1..d66066023984e 100644 --- a/libc/test/UnitTest/FPExceptMatcher.cpp +++ b/libc/test/UnitTest/FPExceptMatcher.cpp @@ -37,7 +37,7 @@ static void sigfpeHandler(int sig) { } FPExceptMatcher::FPExceptMatcher(FunctionCaller *func) { - sighandler_t oldSIGFPEHandler = signal(SIGFPE, &sigfpeHandler); + auto *oldSIGFPEHandler = signal(SIGFPE, &sigfpeHandler); caughtExcept = false; fenv_t oldEnv; diff --git a/libc/test/src/signal/CMakeLists.txt b/libc/test/src/signal/CMakeLists.txt index a27f5b8f1000e..f86ce2ae96857 100644 --- a/libc/test/src/signal/CMakeLists.txt +++ b/libc/test/src/signal/CMakeLists.txt @@ -74,7 +74,6 @@ add_libc_unittest( SRCS signal_test.cpp DEPENDS - libc.hdr.types.sighandler_t libc.src.errno.errno libc.src.signal.raise libc.src.signal.signal diff --git a/libc/test/src/signal/signal_test.cpp b/libc/test/src/signal/signal_test.cpp index 4b57311eee2d8..bac9c3b8b68bb 100644 --- a/libc/test/src/signal/signal_test.cpp +++ b/libc/test/src/signal/signal_test.cpp @@ -13,14 +13,12 @@ #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -#include "hdr/types/sighandler_t.h" - using 
LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; TEST(LlvmLibcSignal, Invalid) { LIBC_NAMESPACE::libc_errno = 0; - sighandler_t valid = +[](int) {}; + auto *valid = +[](int) {}; EXPECT_THAT((void *)LIBC_NAMESPACE::signal(0, valid), Fails(EINVAL, (void *)SIG_ERR)); EXPECT_THAT((void *)LIBC_NAMESPACE::signal(65, valid), From 2c4dd89902c3e679607567569651acf8b828360e Mon Sep 17 00:00:00 2001 From: Diego Caballero Date: Fri, 7 Feb 2025 10:51:24 -0800 Subject: [PATCH 004/293] [mlir][Vector] Introduce poison in LowerVectorBitCast/Broadcast/Transpose (#126180) This PR continues with the introduction of poison as initialization vector, in this particular case, in LowerVectorBitCast, LowerVectorBroadcast and LowerVectorTranspose. --- .../Vector/Transforms/LowerVectorBitCast.cpp | 6 +-- .../Transforms/LowerVectorBroadcast.cpp | 19 ++------- .../Transforms/LowerVectorTranspose.cpp | 15 ++----- .../VectorToLLVM/vector-to-llvm.mlir | 36 ++++++++-------- .../vector-bitcast-lowering-transforms.mlir | 6 +-- .../vector-broadcast-lowering-transforms.mlir | 42 +++++++++---------- .../Vector/vector-transpose-lowering.mlir | 4 +- 7 files changed, 53 insertions(+), 75 deletions(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorBitCast.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorBitCast.cpp index d8c4939dc742a..89930a6bd35fa 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorBitCast.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorBitCast.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Dialect/UB/IR/UBOps.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Utils/VectorUtils.h" @@ -32,7 +33,7 @@ namespace { /// /// Would be unrolled to: /// -/// %result = arith.constant dense<0> : vector<1x2x3x8xi32> +/// %result = ub.poison : 
vector<1x2x3x8xi32> /// %0 = vector.extract %a[0, 0, 0] ─┐ /// : vector<4xi64> from vector<1x2x3x4xi64> | /// %1 = vector.bitcast %0 | - Repeated 6x for @@ -63,8 +64,7 @@ class UnrollBitCastOp final : public OpRewritePattern { VectorType::get(shape, resultType.getElementType(), scalableDims); Location loc = op.getLoc(); - Value result = rewriter.create( - loc, resultType, rewriter.getZeroAttr(resultType)); + Value result = rewriter.create(loc, resultType); for (auto position : *unrollIterator) { Value extract = rewriter.create(loc, op.getSource(), position); diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorBroadcast.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorBroadcast.cpp index 6c36bbaee8523..fec3c6c52e5e4 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorBroadcast.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorBroadcast.cpp @@ -11,27 +11,16 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Arith/Utils/Utils.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Utils/IndexingUtils.h" -#include "mlir/Dialect/Utils/StructuredOpsUtils.h" +#include "mlir/Dialect/UB/IR/UBOps.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" #include "mlir/Dialect/Vector/Utils/VectorUtils.h" -#include "mlir/IR/BuiltinAttributeInterfaces.h" #include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/ImplicitLocOpBuilder.h" #include "mlir/IR/Location.h" -#include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeUtilities.h" -#include "mlir/Interfaces/VectorInterfaces.h" #define DEBUG_TYPE "vector-broadcast-lowering" @@ 
-86,8 +75,7 @@ class BroadcastOpLowering : public OpRewritePattern { VectorType resType = VectorType::Builder(dstType).dropDim(0); Value bcst = rewriter.create(loc, resType, op.getSource()); - Value result = rewriter.create( - loc, dstType, rewriter.getZeroAttr(dstType)); + Value result = rewriter.create(loc, dstType); for (int64_t d = 0, dim = dstType.getDimSize(0); d < dim; ++d) result = rewriter.create(loc, bcst, result, d); rewriter.replaceOp(op, result); @@ -127,8 +115,7 @@ class BroadcastOpLowering : public OpRewritePattern { VectorType resType = VectorType::get(dstType.getShape().drop_front(), eltType, dstType.getScalableDims().drop_front()); - Value result = rewriter.create( - loc, dstType, rewriter.getZeroAttr(dstType)); + Value result = rewriter.create(loc, dstType); if (m == 0) { // Stetch at start. Value ext = rewriter.create(loc, op.getSource(), 0); diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp index 6135a1290d559..fb4dee33bc5f5 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorTranspose.cpp @@ -11,26 +11,19 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Arith/Utils/Utils.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/UB/IR/UBOps.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Utils/VectorUtils.h" -#include "mlir/IR/BuiltinAttributeInterfaces.h" #include "mlir/IR/BuiltinTypes.h" #include 
"mlir/IR/ImplicitLocOpBuilder.h" #include "mlir/IR/Location.h" -#include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeUtilities.h" -#include "mlir/Interfaces/VectorInterfaces.h" #define DEBUG_TYPE "lower-vector-transpose" @@ -291,8 +284,7 @@ static Value transposeToShuffle16x16(OpBuilder &builder, Value source, int m, auto reshInputType = VectorType::get( {m, n}, cast(source.getType()).getElementType()); - Value res = - b.create(reshInputType, b.getZeroAttr(reshInputType)); + Value res = b.create(reshInputType); for (int64_t i = 0; i < m; ++i) res = b.create(vs[i], res, i); return res; @@ -368,8 +360,7 @@ class TransposeOpLowering : public OpRewritePattern { // of the leftmost transposed dimensions. We traverse every transpose // element using a linearized index that we delinearize to generate the // appropriate indices for the extract/insert operations. - Value result = rewriter.create( - loc, resType, rewriter.getZeroAttr(resType)); + Value result = rewriter.create(loc, resType); int64_t numTransposedElements = ShapedType::getNumElements(prunedInShape); for (int64_t linearIdx = 0; linearIdx < numTransposedElements; diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir index f17e8f02c0d80..36b37a137ac1e 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir @@ -185,7 +185,7 @@ func.func @broadcast_vec2d_from_vec0d(%arg0: vector) -> vector<3x2xf32> { // CHECK-LABEL: @broadcast_vec2d_from_vec0d( // CHECK-SAME: %[[A:.*]]: vector) // CHECK: %[[T0:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector to vector<1xf32> -// CHECK: %[[T1:.*]] = arith.constant dense<0.000000e+00> : vector<3x2xf32> +// CHECK: %[[T1:.*]] = ub.poison : vector<3x2xf32> // CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<3x2xf32> to !llvm.array<3 x vector<2xf32>> // CHECK: %[[T4:.*]] = 
llvm.mlir.constant(0 : index) : i64 // CHECK: %[[T5:.*]] = llvm.extractelement %[[T0]][%[[T4]] : i64] : vector<1xf32> @@ -205,7 +205,7 @@ func.func @broadcast_vec2d_from_vec1d(%arg0: vector<2xf32>) -> vector<3x2xf32> { } // CHECK-LABEL: @broadcast_vec2d_from_vec1d( // CHECK-SAME: %[[A:.*]]: vector<2xf32>) -// CHECK: %[[T0:.*]] = arith.constant dense<0.000000e+00> : vector<3x2xf32> +// CHECK: %[[T0:.*]] = ub.poison : vector<3x2xf32> // CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<3x2xf32> to !llvm.array<3 x vector<2xf32>> // CHECK: %[[T2:.*]] = llvm.insertvalue %[[A]], %[[T1]][0] : !llvm.array<3 x vector<2xf32>> // CHECK: %[[T3:.*]] = llvm.insertvalue %[[A]], %[[T2]][1] : !llvm.array<3 x vector<2xf32>> @@ -221,7 +221,7 @@ func.func @broadcast_vec2d_from_vec1d_scalable(%arg0: vector<[2]xf32>) -> vector } // CHECK-LABEL: @broadcast_vec2d_from_vec1d_scalable( // CHECK-SAME: %[[A:.*]]: vector<[2]xf32>) -// CHECK: %[[T0:.*]] = arith.constant dense<0.000000e+00> : vector<3x[2]xf32> +// CHECK: %[[T0:.*]] = ub.poison : vector<3x[2]xf32> // CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<3x[2]xf32> to !llvm.array<3 x vector<[2]xf32>> // CHECK: %[[T2:.*]] = llvm.insertvalue %[[A]], %[[T1]][0] : !llvm.array<3 x vector<[2]xf32>> // CHECK: %[[T3:.*]] = llvm.insertvalue %[[A]], %[[T2]][1] : !llvm.array<3 x vector<[2]xf32>> @@ -238,7 +238,7 @@ func.func @broadcast_vec2d_from_index_vec1d(%arg0: vector<2xindex>) -> vector<3x // CHECK-LABEL: @broadcast_vec2d_from_index_vec1d( // CHECK-SAME: %[[A:.*]]: vector<2xindex>) // CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<2xindex> to vector<2xi64> -// CHECK: %[[T0:.*]] = arith.constant dense<0> : vector<3x2xindex> +// CHECK: %[[T0:.*]] = ub.poison : vector<3x2xindex> // CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<3x2xindex> to !llvm.array<3 x vector<2xi64>> // CHECK: %[[T3:.*]] = llvm.insertvalue %[[T1]], %[[T2]][0] : !llvm.array<3 x 
vector<2xi64>> @@ -254,7 +254,7 @@ func.func @broadcast_vec2d_from_index_vec1d_scalable(%arg0: vector<[2]xindex>) - // CHECK-LABEL: @broadcast_vec2d_from_index_vec1d_scalable( // CHECK-SAME: %[[A:.*]]: vector<[2]xindex>) // CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<[2]xindex> to vector<[2]xi64> -// CHECK: %[[T0:.*]] = arith.constant dense<0> : vector<3x[2]xindex> +// CHECK: %[[T0:.*]] = ub.poison : vector<3x[2]xindex> // CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<3x[2]xindex> to !llvm.array<3 x vector<[2]xi64>> // CHECK: %[[T3:.*]] = llvm.insertvalue %[[T1]], %[[T2]][0] : !llvm.array<3 x vector<[2]xi64>> @@ -269,9 +269,9 @@ func.func @broadcast_vec3d_from_vec1d(%arg0: vector<2xf32>) -> vector<4x3x2xf32> } // CHECK-LABEL: @broadcast_vec3d_from_vec1d( // CHECK-SAME: %[[A:.*]]: vector<2xf32>) -// CHECK-DAG: %[[T0:.*]] = arith.constant dense<0.000000e+00> : vector<3x2xf32> +// CHECK-DAG: %[[T0:.*]] = ub.poison : vector<3x2xf32> // CHECK-DAG: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<3x2xf32> to !llvm.array<3 x vector<2xf32>> -// CHECK-DAG: %[[T1:.*]] = arith.constant dense<0.000000e+00> : vector<4x3x2xf32> +// CHECK-DAG: %[[T1:.*]] = ub.poison : vector<4x3x2xf32> // CHECK-DAG: %[[T6:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<4x3x2xf32> to !llvm.array<4 x array<3 x vector<2xf32>>> // CHECK: %[[T3:.*]] = llvm.insertvalue %[[A]], %[[T2]][0] : !llvm.array<3 x vector<2xf32>> @@ -294,9 +294,9 @@ func.func @broadcast_vec3d_from_vec1d_scalable(%arg0: vector<[2]xf32>) -> vector } // CHECK-LABEL: @broadcast_vec3d_from_vec1d_scalable( // CHECK-SAME: %[[A:.*]]: vector<[2]xf32>) -// CHECK-DAG: %[[T0:.*]] = arith.constant dense<0.000000e+00> : vector<3x[2]xf32> +// CHECK-DAG: %[[T0:.*]] = ub.poison : vector<3x[2]xf32> // CHECK-DAG: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<3x[2]xf32> to !llvm.array<3 x vector<[2]xf32>> -// CHECK-DAG: %[[T1:.*]] = arith.constant 
dense<0.000000e+00> : vector<4x3x[2]xf32> +// CHECK-DAG: %[[T1:.*]] = ub.poison : vector<4x3x[2]xf32> // CHECK-DAG: %[[T6:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<4x3x[2]xf32> to !llvm.array<4 x array<3 x vector<[2]xf32>>> // CHECK: %[[T3:.*]] = llvm.insertvalue %[[A]], %[[T2]][0] : !llvm.array<3 x vector<[2]xf32>> @@ -320,7 +320,7 @@ func.func @broadcast_vec3d_from_vec2d(%arg0: vector<3x2xf32>) -> vector<4x3x2xf3 // CHECK-LABEL: @broadcast_vec3d_from_vec2d( // CHECK-SAME: %[[A:.*]]: vector<3x2xf32>) // CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<3x2xf32> to !llvm.array<3 x vector<2xf32>> -// CHECK: %[[T0:.*]] = arith.constant dense<0.000000e+00> : vector<4x3x2xf32> +// CHECK: %[[T0:.*]] = ub.poison : vector<4x3x2xf32> // CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<4x3x2xf32> to !llvm.array<4 x array<3 x vector<2xf32>>> // CHECK: %[[T3:.*]] = llvm.insertvalue %[[T1]], %[[T2]][0] : !llvm.array<4 x array<3 x vector<2xf32>>> // CHECK: %[[T5:.*]] = llvm.insertvalue %[[T1]], %[[T3]][1] : !llvm.array<4 x array<3 x vector<2xf32>>> @@ -338,7 +338,7 @@ func.func @broadcast_vec3d_from_vec2d_scalable(%arg0: vector<3x[2]xf32>) -> vect // CHECK-LABEL: @broadcast_vec3d_from_vec2d_scalable( // CHECK-SAME: %[[A:.*]]: vector<3x[2]xf32>) // CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<3x[2]xf32> to !llvm.array<3 x vector<[2]xf32>> -// CHECK: %[[T0:.*]] = arith.constant dense<0.000000e+00> : vector<4x3x[2]xf32> +// CHECK: %[[T0:.*]] = ub.poison : vector<4x3x[2]xf32> // CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<4x3x[2]xf32> to !llvm.array<4 x array<3 x vector<[2]xf32>>> // CHECK: %[[T3:.*]] = llvm.insertvalue %[[T1]], %[[T2]][0] : !llvm.array<4 x array<3 x vector<[2]xf32>>> // CHECK: %[[T5:.*]] = llvm.insertvalue %[[T1]], %[[T3]][1] : !llvm.array<4 x array<3 x vector<[2]xf32>>> @@ -385,7 +385,7 @@ func.func @broadcast_stretch_at_start(%arg0: vector<1x4xf32>) 
-> vector<3x4xf32> // CHECK-LABEL: @broadcast_stretch_at_start( // CHECK-SAME: %[[A:.*]]: vector<1x4xf32>) // CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<1x4xf32> to !llvm.array<1 x vector<4xf32>> -// CHECK: %[[T1:.*]] = arith.constant dense<0.000000e+00> : vector<3x4xf32> +// CHECK: %[[T1:.*]] = ub.poison : vector<3x4xf32> // CHECK: %[[T4:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<3x4xf32> to !llvm.array<3 x vector<4xf32>> // CHECK: %[[T3:.*]] = llvm.extractvalue %[[T2]][0] : !llvm.array<1 x vector<4xf32>> // CHECK: %[[T5:.*]] = llvm.insertvalue %[[T3]], %[[T4]][0] : !llvm.array<3 x vector<4xf32>> @@ -403,7 +403,7 @@ func.func @broadcast_stretch_at_start_scalable(%arg0: vector<1x[4]xf32>) -> vect // CHECK-LABEL: @broadcast_stretch_at_start_scalable( // CHECK-SAME: %[[A:.*]]: vector<1x[4]xf32>) // CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<1x[4]xf32> to !llvm.array<1 x vector<[4]xf32>> -// CHECK: %[[T1:.*]] = arith.constant dense<0.000000e+00> : vector<3x[4]xf32> +// CHECK: %[[T1:.*]] = ub.poison : vector<3x[4]xf32> // CHECK: %[[T4:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<3x[4]xf32> to !llvm.array<3 x vector<[4]xf32>> // CHECK: %[[T3:.*]] = llvm.extractvalue %[[T2]][0] : !llvm.array<1 x vector<[4]xf32>> // CHECK: %[[T5:.*]] = llvm.insertvalue %[[T3]], %[[T4]][0] : !llvm.array<3 x vector<[4]xf32>> @@ -421,7 +421,7 @@ func.func @broadcast_stretch_at_end(%arg0: vector<4x1xf32>) -> vector<4x3xf32> { // CHECK-LABEL: @broadcast_stretch_at_end( // CHECK-SAME: %[[A:.*]]: vector<4x1xf32>) // CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<4x1xf32> to !llvm.array<4 x vector<1xf32>> -// CHECK: %[[T1:.*]] = arith.constant dense<0.000000e+00> : vector<4x3xf32> +// CHECK: %[[T1:.*]] = ub.poison : vector<4x3xf32> // CHECK: %[[T7:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<4x3xf32> to !llvm.array<4 x vector<3xf32>> // CHECK: %[[T3:.*]] = llvm.extractvalue 
%[[T2]][0] : !llvm.array<4 x vector<1xf32>> // CHECK: %[[T4:.*]] = llvm.mlir.constant(0 : i64) : i64 @@ -469,9 +469,9 @@ func.func @broadcast_stretch_in_middle(%arg0: vector<4x1x2xf32>) -> vector<4x3x2 // CHECK-LABEL: @broadcast_stretch_in_middle( // CHECK-SAME: %[[A:.*]]: vector<4x1x2xf32>) -> vector<4x3x2xf32> { // CHECK: %[[T3:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<4x1x2xf32> to !llvm.array<4 x array<1 x vector<2xf32>>> -// CHECK: %[[T1:.*]] = arith.constant dense<0.000000e+00> : vector<4x3x2xf32> +// CHECK: %[[T1:.*]] = ub.poison : vector<4x3x2xf32> // CHECK: %[[T9:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<4x3x2xf32> to !llvm.array<4 x array<3 x vector<2xf32>>> -// CHECK: %[[T2:.*]] = arith.constant dense<0.000000e+00> : vector<3x2xf32> +// CHECK: %[[T2:.*]] = ub.poison : vector<3x2xf32> // CHECK: %[[T5:.*]] = builtin.unrealized_conversion_cast %[[T2]] : vector<3x2xf32> to !llvm.array<3 x vector<2xf32>> // CHECK: %[[T4:.*]] = llvm.extractvalue %[[T3]][0, 0] : !llvm.array<4 x array<1 x vector<2xf32>>> // CHECK: %[[T6:.*]] = llvm.insertvalue %[[T4]], %[[T5]][0] : !llvm.array<3 x vector<2xf32>> @@ -505,9 +505,9 @@ func.func @broadcast_stretch_in_middle_scalable_v1(%arg0: vector<4x1x[2]xf32>) - // CHECK-LABEL: @broadcast_stretch_in_middle_scalable_v1( // CHECK-SAME: %[[A:.*]]: vector<4x1x[2]xf32>) -> vector<4x3x[2]xf32> { // CHECK: %[[T3:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<4x1x[2]xf32> to !llvm.array<4 x array<1 x vector<[2]xf32>>> -// CHECK: %[[T1:.*]] = arith.constant dense<0.000000e+00> : vector<4x3x[2]xf32> +// CHECK: %[[T1:.*]] = ub.poison : vector<4x3x[2]xf32> // CHECK: %[[T9:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<4x3x[2]xf32> to !llvm.array<4 x array<3 x vector<[2]xf32>>> -// CHECK: %[[T2:.*]] = arith.constant dense<0.000000e+00> : vector<3x[2]xf32> +// CHECK: %[[T2:.*]] = ub.poison : vector<3x[2]xf32> // CHECK: %[[T5:.*]] = builtin.unrealized_conversion_cast %[[T2]] : 
vector<3x[2]xf32> to !llvm.array<3 x vector<[2]xf32>> // CHECK: %[[T4:.*]] = llvm.extractvalue %[[T3]][0, 0] : !llvm.array<4 x array<1 x vector<[2]xf32>>> // CHECK: %[[T6:.*]] = llvm.insertvalue %[[T4]], %[[T5]][0] : !llvm.array<3 x vector<[2]xf32>> diff --git a/mlir/test/Dialect/Vector/vector-bitcast-lowering-transforms.mlir b/mlir/test/Dialect/Vector/vector-bitcast-lowering-transforms.mlir index 346291019451c..29e7007666e87 100644 --- a/mlir/test/Dialect/Vector/vector-bitcast-lowering-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-bitcast-lowering-transforms.mlir @@ -24,7 +24,7 @@ func.func @vector_bitcast_2d(%arg0: vector<2x4xi32>) -> vector<2x2xi64> { } // CHECK-LABEL: func.func @vector_bitcast_2d // CHECK-SAME: %[[IN:[a-zA-Z0-9]+]] -// CHECK: %[[INIT:.+]] = arith.constant {{.+}} : vector<2x2xi64> +// CHECK: %[[INIT:.+]] = ub.poison : vector<2x2xi64> // CHECK: %[[V1:.+]] = vector.extract %[[IN]][0] : vector<4xi32> from vector<2x4xi32> // CHECK: %[[B1:.+]] = vector.bitcast %[[V1]] : vector<4xi32> to vector<2xi64> // CHECK: %[[R1:.+]] = vector.insert %[[B1]], %[[INIT]] [0] @@ -39,7 +39,7 @@ func.func @vector_bitcast_4d_with_scalable_dim(%arg0: vector<1x2x[3]x4xi64>) -> } // CHECK-LABEL: func.func @vector_bitcast_4d_with_scalable_dim // CHECK-SAME: %[[IN:[a-zA-Z0-9]+]] -// CHECK: %[[INIT:.+]] = arith.constant dense<0> : vector<1x2x[3]x8xi32> +// CHECK: %[[INIT:.+]] = ub.poison : vector<1x2x[3]x8xi32> // CHECK: %[[V1:.+]] = vector.extract %[[IN]][0, 0] : vector<[3]x4xi64> from vector<1x2x[3]x4xi64> // CHECK: %[[B1:.+]] = vector.bitcast %[[V1]] : vector<[3]x4xi64> to vector<[3]x8xi32> // CHECK: %[[R1:.+]] = vector.insert %[[B1]], %[[INIT]] [0, 0] : vector<[3]x8xi32> into vector<1x2x[3]x8xi32> @@ -54,7 +54,7 @@ func.func @vector_bitcast_2d_trailing_scalable_dim(%arg0: vector<2x[2]xi64>) -> } // CHECK-LABEL: func.func @vector_bitcast_2d_trailing_scalable_dim // CHECK-SAME: %[[IN:[a-zA-Z0-9]+]] -// CHECK: %[[INIT:.+]] = arith.constant dense<0> : 
vector<2x[4]xi32> +// CHECK: %[[INIT:.+]] = ub.poison : vector<2x[4]xi32> // CHECK: %[[V1:.+]] = vector.extract %[[IN]][0] : vector<[2]xi64> from vector<2x[2]xi64> // CHECK: %[[B1:.+]] = vector.bitcast %[[V1]] : vector<[2]xi64> to vector<[4]xi32> // CHECK: %[[R1:.+]] = vector.insert %[[B1]], %[[INIT]] [0] : vector<[4]xi32> into vector<2x[4]xi32> diff --git a/mlir/test/Dialect/Vector/vector-broadcast-lowering-transforms.mlir b/mlir/test/Dialect/Vector/vector-broadcast-lowering-transforms.mlir index 4a5ea439134cf..8e167a520260f 100644 --- a/mlir/test/Dialect/Vector/vector-broadcast-lowering-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-broadcast-lowering-transforms.mlir @@ -41,8 +41,8 @@ func.func @broadcast_vec1d_from_vec1d(%arg0: vector<2xf32>) -> vector<2xf32> { // CHECK-LABEL: func @broadcast_vec2d_from_vec1d // CHECK-SAME: %[[A:.*0]]: vector<2xf32> -// CHECK: %[[C0:.*]] = arith.constant dense<0.000000e+00> : vector<3x2xf32> -// CHECK: %[[T0:.*]] = vector.insert %[[A]], %[[C0]] [0] : vector<2xf32> into vector<3x2xf32> +// CHECK: %[[U0:.*]] = ub.poison : vector<3x2xf32> +// CHECK: %[[T0:.*]] = vector.insert %[[A]], %[[U0]] [0] : vector<2xf32> into vector<3x2xf32> // CHECK: %[[T1:.*]] = vector.insert %[[A]], %[[T0]] [1] : vector<2xf32> into vector<3x2xf32> // CHECK: %[[T2:.*]] = vector.insert %[[A]], %[[T1]] [2] : vector<2xf32> into vector<3x2xf32> // CHECK: return %[[T2]] : vector<3x2xf32> @@ -54,12 +54,12 @@ func.func @broadcast_vec2d_from_vec1d(%arg0: vector<2xf32>) -> vector<3x2xf32> { // CHECK-LABEL: func @broadcast_vec3d_from_vec1d // CHECK-SAME: %[[A:.*0]]: vector<2xf32> -// CHECK-DAG: %[[C0:.*]] = arith.constant dense<0.000000e+00> : vector<3x2xf32> -// CHECK-DAG: %[[C1:.*]] = arith.constant dense<0.000000e+00> : vector<4x3x2xf32> -// CHECK: %[[T0:.*]] = vector.insert %[[A]], %[[C0]] [0] : vector<2xf32> into vector<3x2xf32> +// CHECK-DAG: %[[U0:.*]] = ub.poison : vector<3x2xf32> +// CHECK-DAG: %[[U1:.*]] = ub.poison : vector<4x3x2xf32> +// CHECK: 
%[[T0:.*]] = vector.insert %[[A]], %[[U0]] [0] : vector<2xf32> into vector<3x2xf32> // CHECK: %[[T1:.*]] = vector.insert %[[A]], %[[T0]] [1] : vector<2xf32> into vector<3x2xf32> // CHECK: %[[T2:.*]] = vector.insert %[[A]], %[[T1]] [2] : vector<2xf32> into vector<3x2xf32> -// CHECK: %[[T3:.*]] = vector.insert %[[T2]], %[[C1]] [0] : vector<3x2xf32> into vector<4x3x2xf32> +// CHECK: %[[T3:.*]] = vector.insert %[[T2]], %[[U1]] [0] : vector<3x2xf32> into vector<4x3x2xf32> // CHECK: %[[T4:.*]] = vector.insert %[[T2]], %[[T3]] [1] : vector<3x2xf32> into vector<4x3x2xf32> // CHECK: %[[T5:.*]] = vector.insert %[[T2]], %[[T4]] [2] : vector<3x2xf32> into vector<4x3x2xf32> // CHECK: %[[T6:.*]] = vector.insert %[[T2]], %[[T5]] [3] : vector<3x2xf32> into vector<4x3x2xf32> @@ -72,8 +72,8 @@ func.func @broadcast_vec3d_from_vec1d(%arg0: vector<2xf32>) -> vector<4x3x2xf32> // CHECK-LABEL: func @broadcast_vec3d_from_vec2d // CHECK-SAME: %[[A:.*0]]: vector<3x2xf32> -// CHECK: %[[C0:.*]] = arith.constant dense<0.000000e+00> : vector<4x3x2xf32> -// CHECK: %[[T0:.*]] = vector.insert %[[A]], %[[C0]] [0] : vector<3x2xf32> into vector<4x3x2xf32> +// CHECK: %[[U0:.*]] = ub.poison : vector<4x3x2xf32> +// CHECK: %[[T0:.*]] = vector.insert %[[A]], %[[U0]] [0] : vector<3x2xf32> into vector<4x3x2xf32> // CHECK: %[[T1:.*]] = vector.insert %[[A]], %[[T0]] [1] : vector<3x2xf32> into vector<4x3x2xf32> // CHECK: %[[T2:.*]] = vector.insert %[[A]], %[[T1]] [2] : vector<3x2xf32> into vector<4x3x2xf32> // CHECK: %[[T3:.*]] = vector.insert %[[A]], %[[T2]] [3] : vector<3x2xf32> into vector<4x3x2xf32> @@ -97,9 +97,9 @@ func.func @broadcast_stretch(%arg0: vector<1xf32>) -> vector<4xf32> { // CHECK-LABEL: func @broadcast_stretch_at_start // CHECK-SAME: %[[A:.*0]]: vector<1x4xf32> -// CHECK: %[[C0:.*]] = arith.constant dense<0.000000e+00> : vector<3x4xf32> +// CHECK: %[[U0:.*]] = ub.poison : vector<3x4xf32> // CHECK: %[[T0:.*]] = vector.extract %[[A]][0] : vector<4xf32> from vector<1x4xf32> -// CHECK: 
%[[T1:.*]] = vector.insert %[[T0]], %[[C0]] [0] : vector<4xf32> into vector<3x4xf32> +// CHECK: %[[T1:.*]] = vector.insert %[[T0]], %[[U0]] [0] : vector<4xf32> into vector<3x4xf32> // CHECK: %[[T2:.*]] = vector.insert %[[T0]], %[[T1]] [1] : vector<4xf32> into vector<3x4xf32> // CHECK: %[[T3:.*]] = vector.insert %[[T0]], %[[T2]] [2] : vector<4xf32> into vector<3x4xf32> // CHECK: return %[[T3]] : vector<3x4xf32> @@ -111,10 +111,10 @@ func.func @broadcast_stretch_at_start(%arg0: vector<1x4xf32>) -> vector<3x4xf32> // CHECK-LABEL: func @broadcast_stretch_at_end // CHECK-SAME: %[[A:.*0]]: vector<4x1xf32> -// CHECK: %[[C0:.*]] = arith.constant dense<0.000000e+00> : vector<4x3xf32> +// CHECK: %[[U0:.*]] = ub.poison : vector<4x3xf32> // CHECK: %[[T0:.*]] = vector.extract %[[A]][0, 0] : f32 from vector<4x1xf32> // CHECK: %[[T2:.*]] = vector.splat %[[T0]] : vector<3xf32> -// CHECK: %[[T3:.*]] = vector.insert %[[T2]], %[[C0]] [0] : vector<3xf32> into vector<4x3xf32> +// CHECK: %[[T3:.*]] = vector.insert %[[T2]], %[[U0]] [0] : vector<3xf32> into vector<4x3xf32> // CHECK: %[[T4:.*]] = vector.extract %[[A]][1, 0] : f32 from vector<4x1xf32> // CHECK: %[[T6:.*]] = vector.splat %[[T4]] : vector<3xf32> // CHECK: %[[T7:.*]] = vector.insert %[[T6]], %[[T3]] [1] : vector<3xf32> into vector<4x3xf32> @@ -133,25 +133,25 @@ func.func @broadcast_stretch_at_end(%arg0: vector<4x1xf32>) -> vector<4x3xf32> { // CHECK-LABEL: func @broadcast_stretch_in_middle // CHECK-SAME: %[[A:.*0]]: vector<4x1x2xf32> -// CHECK: %[[C0:.*]] = arith.constant dense<0.000000e+00> : vector<4x3x2xf32> -// CHECK: %[[C1:.*]] = arith.constant dense<0.000000e+00> : vector<3x2xf32> +// CHECK: %[[U0:.*]] = ub.poison : vector<4x3x2xf32> +// CHECK: %[[U1:.*]] = ub.poison : vector<3x2xf32> // CHECK: %[[T0:.*]] = vector.extract %[[A]][0, 0] : vector<2xf32> from vector<4x1x2xf32> -// CHECK: %[[T2:.*]] = vector.insert %[[T0]], %[[C1]] [0] : vector<2xf32> into vector<3x2xf32> +// CHECK: %[[T2:.*]] = vector.insert %[[T0]], %[[U1]] 
[0] : vector<2xf32> into vector<3x2xf32> // CHECK: %[[T3:.*]] = vector.insert %[[T0]], %[[T2]] [1] : vector<2xf32> into vector<3x2xf32> // CHECK: %[[T4:.*]] = vector.insert %[[T0]], %[[T3]] [2] : vector<2xf32> into vector<3x2xf32> -// CHECK: %[[T5:.*]] = vector.insert %[[T4]], %[[C0]] [0] : vector<3x2xf32> into vector<4x3x2xf32> +// CHECK: %[[T5:.*]] = vector.insert %[[T4]], %[[U0]] [0] : vector<3x2xf32> into vector<4x3x2xf32> // CHECK: %[[T6:.*]] = vector.extract %[[A]][1, 0] : vector<2xf32> from vector<4x1x2xf32> -// CHECK: %[[T8:.*]] = vector.insert %[[T6]], %[[C1]] [0] : vector<2xf32> into vector<3x2xf32> +// CHECK: %[[T8:.*]] = vector.insert %[[T6]], %[[U1]] [0] : vector<2xf32> into vector<3x2xf32> // CHECK: %[[T9:.*]] = vector.insert %[[T6]], %[[T8]] [1] : vector<2xf32> into vector<3x2xf32> // CHECK: %[[T10:.*]] = vector.insert %[[T6]], %[[T9]] [2] : vector<2xf32> into vector<3x2xf32> // CHECK: %[[T11:.*]] = vector.insert %[[T10]], %[[T5]] [1] : vector<3x2xf32> into vector<4x3x2xf32> // CHECK: %[[T12:.*]] = vector.extract %[[A]][2, 0] : vector<2xf32> from vector<4x1x2xf32> -// CHECK: %[[T14:.*]] = vector.insert %[[T12]], %[[C1]] [0] : vector<2xf32> into vector<3x2xf32> +// CHECK: %[[T14:.*]] = vector.insert %[[T12]], %[[U1]] [0] : vector<2xf32> into vector<3x2xf32> // CHECK: %[[T15:.*]] = vector.insert %[[T12]], %[[T14]] [1] : vector<2xf32> into vector<3x2xf32> // CHECK: %[[T16:.*]] = vector.insert %[[T12]], %[[T15]] [2] : vector<2xf32> into vector<3x2xf32> // CHECK: %[[T17:.*]] = vector.insert %[[T16]], %[[T11]] [2] : vector<3x2xf32> into vector<4x3x2xf32> // CHECK: %[[T18:.*]] = vector.extract %[[A]][3, 0] : vector<2xf32> from vector<4x1x2xf32> -// CHECK: %[[T20:.*]] = vector.insert %[[T18]], %[[C1]] [0] : vector<2xf32> into vector<3x2xf32> +// CHECK: %[[T20:.*]] = vector.insert %[[T18]], %[[U1]] [0] : vector<2xf32> into vector<3x2xf32> // CHECK: %[[T21:.*]] = vector.insert %[[T18]], %[[T20]] [1] : vector<2xf32> into vector<3x2xf32> // CHECK: %[[T22:.*]] = 
vector.insert %[[T18]], %[[T21]] [2] : vector<2xf32> into vector<3x2xf32> // CHECK: %[[T23:.*]] = vector.insert %[[T22]], %[[T17]] [3] : vector<3x2xf32> into vector<4x3x2xf32> @@ -164,8 +164,8 @@ func.func @broadcast_stretch_in_middle(%arg0: vector<4x1x2xf32>) -> vector<4x3x2 // CHECK-LABEL: func.func @broadcast_scalable_duplication // CHECK-SAME: %[[ARG0:.*]]: vector<[32]xf32>) -// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00> : vector<1x[32]xf32> -// CHECK: %[[RES:.*]] = vector.insert %[[ARG0]], %[[CST]] [0] : vector<[32]xf32> into vector<1x[32]xf32> +// CHECK: %[[INIT:.*]] = ub.poison : vector<1x[32]xf32> +// CHECK: %[[RES:.*]] = vector.insert %[[ARG0]], %[[INIT]] [0] : vector<[32]xf32> into vector<1x[32]xf32> // CHECK: return %[[RES]] : vector<1x[32]xf32> func.func @broadcast_scalable_duplication(%arg0: vector<[32]xf32>) -> vector<1x[32]xf32> { diff --git a/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir b/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir index 219a72df52a19..83395504e8c74 100644 --- a/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir +++ b/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir @@ -2,9 +2,9 @@ // CHECK-LABEL: func @transpose23 // CHECK-SAME: %[[A:.*]]: vector<2x3xf32> -// CHECK: %[[Z:.*]] = arith.constant dense<0.000000e+00> : vector<3x2xf32> +// CHECK: %[[UB:.*]] = ub.poison : vector<3x2xf32> // CHECK: %[[T0:.*]] = vector.extract %[[A]][0, 0] : f32 from vector<2x3xf32> -// CHECK: %[[T1:.*]] = vector.insert %[[T0]], %[[Z]] [0, 0] : f32 into vector<3x2xf32> +// CHECK: %[[T1:.*]] = vector.insert %[[T0]], %[[UB]] [0, 0] : f32 into vector<3x2xf32> // CHECK: %[[T2:.*]] = vector.extract %[[A]][0, 1] : f32 from vector<2x3xf32> // CHECK: %[[T3:.*]] = vector.insert %[[T2]], %[[T1]] [1, 0] : f32 into vector<3x2xf32> // CHECK: %[[T4:.*]] = vector.extract %[[A]][0, 2] : f32 from vector<2x3xf32> From f2a1103b323492160d7d27a1575fbda709b49036 Mon Sep 17 00:00:00 2001 From: Krishna Pandey 
<47917477+krishna2803@users.noreply.github.com> Date: Sat, 8 Feb 2025 00:23:17 +0530 Subject: [PATCH 005/293] [libc][stdfix] Implement fixed point `countlsfx` functions in llvm-libc (#125356) fixes #113357 --- libc/config/baremetal/arm/entrypoints.txt | 12 +++ libc/config/baremetal/riscv/entrypoints.txt | 12 +++ libc/config/linux/riscv/entrypoints.txt | 12 +++ libc/config/linux/x86_64/entrypoints.txt | 12 +++ libc/docs/headers/math/stdfix.rst | 2 +- libc/include/stdfix.yaml | 84 +++++++++++++++++++ libc/src/__support/fixed_point/CMakeLists.txt | 1 + libc/src/__support/fixed_point/fx_bits.h | 37 +++++++- libc/src/stdfix/CMakeLists.txt | 12 +++ libc/src/stdfix/countlshk.cpp | 20 +++++ libc/src/stdfix/countlshk.h | 21 +++++ libc/src/stdfix/countlshr.cpp | 20 +++++ libc/src/stdfix/countlshr.h | 21 +++++ libc/src/stdfix/countlsk.cpp | 18 ++++ libc/src/stdfix/countlsk.h | 21 +++++ libc/src/stdfix/countlslk.cpp | 20 +++++ libc/src/stdfix/countlslk.h | 21 +++++ libc/src/stdfix/countlslr.cpp | 20 +++++ libc/src/stdfix/countlslr.h | 21 +++++ libc/src/stdfix/countlsr.cpp | 18 ++++ libc/src/stdfix/countlsr.h | 21 +++++ libc/src/stdfix/countlsuhk.cpp | 20 +++++ libc/src/stdfix/countlsuhk.h | 21 +++++ libc/src/stdfix/countlsuhr.cpp | 20 +++++ libc/src/stdfix/countlsuhr.h | 21 +++++ libc/src/stdfix/countlsuk.cpp | 20 +++++ libc/src/stdfix/countlsuk.h | 21 +++++ libc/src/stdfix/countlsulk.cpp | 20 +++++ libc/src/stdfix/countlsulk.h | 21 +++++ libc/src/stdfix/countlsulr.cpp | 20 +++++ libc/src/stdfix/countlsulr.h | 21 +++++ libc/src/stdfix/countlsur.cpp | 20 +++++ libc/src/stdfix/countlsur.h | 21 +++++ libc/test/src/stdfix/CMakeLists.txt | 16 ++++ libc/test/src/stdfix/CountlsTest.h | 62 ++++++++++++++ libc/test/src/stdfix/countlshk_test.cpp | 13 +++ libc/test/src/stdfix/countlshr_test.cpp | 13 +++ libc/test/src/stdfix/countlsk_test.cpp | 13 +++ libc/test/src/stdfix/countlslk_test.cpp | 13 +++ libc/test/src/stdfix/countlslr_test.cpp | 13 +++ libc/test/src/stdfix/countlsr_test.cpp | 
13 +++ libc/test/src/stdfix/countlsuhk_test.cpp | 13 +++ libc/test/src/stdfix/countlsuhr_test.cpp | 13 +++ libc/test/src/stdfix/countlsuk_test.cpp | 13 +++ libc/test/src/stdfix/countlsulk_test.cpp | 13 +++ libc/test/src/stdfix/countlsulr_test.cpp | 13 +++ libc/test/src/stdfix/countlsur_test.cpp | 13 +++ 47 files changed, 903 insertions(+), 3 deletions(-) create mode 100644 libc/src/stdfix/countlshk.cpp create mode 100644 libc/src/stdfix/countlshk.h create mode 100644 libc/src/stdfix/countlshr.cpp create mode 100644 libc/src/stdfix/countlshr.h create mode 100644 libc/src/stdfix/countlsk.cpp create mode 100644 libc/src/stdfix/countlsk.h create mode 100644 libc/src/stdfix/countlslk.cpp create mode 100644 libc/src/stdfix/countlslk.h create mode 100644 libc/src/stdfix/countlslr.cpp create mode 100644 libc/src/stdfix/countlslr.h create mode 100644 libc/src/stdfix/countlsr.cpp create mode 100644 libc/src/stdfix/countlsr.h create mode 100644 libc/src/stdfix/countlsuhk.cpp create mode 100644 libc/src/stdfix/countlsuhk.h create mode 100644 libc/src/stdfix/countlsuhr.cpp create mode 100644 libc/src/stdfix/countlsuhr.h create mode 100644 libc/src/stdfix/countlsuk.cpp create mode 100644 libc/src/stdfix/countlsuk.h create mode 100644 libc/src/stdfix/countlsulk.cpp create mode 100644 libc/src/stdfix/countlsulk.h create mode 100644 libc/src/stdfix/countlsulr.cpp create mode 100644 libc/src/stdfix/countlsulr.h create mode 100644 libc/src/stdfix/countlsur.cpp create mode 100644 libc/src/stdfix/countlsur.h create mode 100644 libc/test/src/stdfix/CountlsTest.h create mode 100644 libc/test/src/stdfix/countlshk_test.cpp create mode 100644 libc/test/src/stdfix/countlshr_test.cpp create mode 100644 libc/test/src/stdfix/countlsk_test.cpp create mode 100644 libc/test/src/stdfix/countlslk_test.cpp create mode 100644 libc/test/src/stdfix/countlslr_test.cpp create mode 100644 libc/test/src/stdfix/countlsr_test.cpp create mode 100644 libc/test/src/stdfix/countlsuhk_test.cpp create mode 100644 
libc/test/src/stdfix/countlsuhr_test.cpp create mode 100644 libc/test/src/stdfix/countlsuk_test.cpp create mode 100644 libc/test/src/stdfix/countlsulk_test.cpp create mode 100644 libc/test/src/stdfix/countlsulr_test.cpp create mode 100644 libc/test/src/stdfix/countlsur_test.cpp diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index 694cd7b1993ca..351f727389e3a 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -469,6 +469,18 @@ if(LIBC_COMPILER_HAS_FIXED_POINT) libc.src.stdfix.ukbits libc.src.stdfix.lkbits libc.src.stdfix.ulkbits + libc.src.stdfix.countlshr + libc.src.stdfix.countlsr + libc.src.stdfix.countlslr + libc.src.stdfix.countlshk + libc.src.stdfix.countlsk + libc.src.stdfix.countlslk + libc.src.stdfix.countlsuhr + libc.src.stdfix.countlsur + libc.src.stdfix.countlsulr + libc.src.stdfix.countlsuhk + libc.src.stdfix.countlsuk + libc.src.stdfix.countlsulk ) endif() diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt index 667ab40dca999..39c70a22a21e0 100644 --- a/libc/config/baremetal/riscv/entrypoints.txt +++ b/libc/config/baremetal/riscv/entrypoints.txt @@ -464,6 +464,18 @@ if(LIBC_COMPILER_HAS_FIXED_POINT) libc.src.stdfix.ukbits libc.src.stdfix.lkbits libc.src.stdfix.ulkbits + libc.src.stdfix.countlshr + libc.src.stdfix.countlsr + libc.src.stdfix.countlslr + libc.src.stdfix.countlshk + libc.src.stdfix.countlsk + libc.src.stdfix.countlslk + libc.src.stdfix.countlsuhr + libc.src.stdfix.countlsur + libc.src.stdfix.countlsulr + libc.src.stdfix.countlsuhk + libc.src.stdfix.countlsuk + libc.src.stdfix.countlsulk ) endif() diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 6e67ea559d57b..a9ba0c257755b 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -749,6 +749,18 @@ if(LIBC_COMPILER_HAS_FIXED_POINT) # TODO: 
https://github.com/llvm/llvm-project/issues/115778 libc.src.stdfix.lkbits libc.src.stdfix.ulkbits + libc.src.stdfix.countlshr + libc.src.stdfix.countlsr + libc.src.stdfix.countlslr + libc.src.stdfix.countlshk + libc.src.stdfix.countlsk + libc.src.stdfix.countlslk + libc.src.stdfix.countlsuhr + libc.src.stdfix.countlsur + libc.src.stdfix.countlsulr + libc.src.stdfix.countlsuhk + libc.src.stdfix.countlsuk + libc.src.stdfix.countlsulk ) endif() diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 81dceb74a1774..2a4c17a56f377 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -874,6 +874,18 @@ if(LIBC_COMPILER_HAS_FIXED_POINT) libc.src.stdfix.ukbits libc.src.stdfix.lkbits libc.src.stdfix.ulkbits + libc.src.stdfix.countlshr + libc.src.stdfix.countlsr + libc.src.stdfix.countlslr + libc.src.stdfix.countlshk + libc.src.stdfix.countlsk + libc.src.stdfix.countlslk + libc.src.stdfix.countlsuhr + libc.src.stdfix.countlsur + libc.src.stdfix.countlsulr + libc.src.stdfix.countlsuhk + libc.src.stdfix.countlsuk + libc.src.stdfix.countlsulk ) endif() diff --git a/libc/docs/headers/math/stdfix.rst b/libc/docs/headers/math/stdfix.rst index 58052f000995c..4507f2b608bf1 100644 --- a/libc/docs/headers/math/stdfix.rst +++ b/libc/docs/headers/math/stdfix.rst @@ -73,7 +73,7 @@ The following functions are included in the ISO/IEC TR 18037:2008 standard. 
+---------------+----------------+-------------+---------------+------------+----------------+-------------+----------------+-------------+---------------+------------+----------------+-------------+ | \*bits | | | | | | | | | | | | | +---------------+----------------+-------------+---------------+------------+----------------+-------------+----------------+-------------+---------------+------------+----------------+-------------+ -| countls | | | | | | | | | | | | | +| countls | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | +---------------+----------------+-------------+---------------+------------+----------------+-------------+----------------+-------------+---------------+------------+----------------+-------------+ | divi | | | | | | | | | | | | | +---------------+----------------+-------------+---------------+------------+----------------+-------------+----------------+-------------+---------------+------------+----------------+-------------+ diff --git a/libc/include/stdfix.yaml b/libc/include/stdfix.yaml index 9663ac0c7df4d..0abf2f3a9b3b6 100644 --- a/libc/include/stdfix.yaml +++ b/libc/include/stdfix.yaml @@ -306,3 +306,87 @@ functions: arguments: - type: unsigned int guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlshr + standards: + - stdc_ext + return_type: int + arguments: + - type: short fract + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlsr + standards: + - stdc_ext + return_type: int + arguments: + - type: fract + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlslr + standards: + - stdc_ext + return_type: int + arguments: + - type: long fract + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlshk + standards: + - stdc_ext + return_type: int + arguments: + - type: short accum + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlsk + standards: + - stdc_ext + return_type: int + arguments: + - type: accum + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: 
countlslk + standards: + - stdc_ext + return_type: int + arguments: + - type: long accum + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlsuhr + standards: + - stdc_ext + return_type: int + arguments: + - type: unsigned short fract + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlsur + standards: + - stdc_ext + return_type: int + arguments: + - type: unsigned fract + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlsulr + standards: + - stdc_ext + return_type: int + arguments: + - type: unsigned long fract + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlsuhk + standards: + - stdc_ext + return_type: int + arguments: + - type: unsigned short accum + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlsuk + standards: + - stdc_ext + return_type: int + arguments: + - type: unsigned accum + guard: LIBC_COMPILER_HAS_FIXED_POINT + - name: countlsulk + standards: + - stdc_ext + return_type: int + arguments: + - type: unsigned long accum + guard: LIBC_COMPILER_HAS_FIXED_POINT diff --git a/libc/src/__support/fixed_point/CMakeLists.txt b/libc/src/__support/fixed_point/CMakeLists.txt index 3b744081765e4..b415e2c00c488 100644 --- a/libc/src/__support/fixed_point/CMakeLists.txt +++ b/libc/src/__support/fixed_point/CMakeLists.txt @@ -19,6 +19,7 @@ add_header_library( libc.src.__support.macros.optimization libc.src.__support.CPP.type_traits libc.src.__support.CPP.bit + libc.src.__support.CPP.limits libc.src.__support.math_extras ) diff --git a/libc/src/__support/fixed_point/fx_bits.h b/libc/src/__support/fixed_point/fx_bits.h index 225ea417760a0..21985e6442534 100644 --- a/libc/src/__support/fixed_point/fx_bits.h +++ b/libc/src/__support/fixed_point/fx_bits.h @@ -11,9 +11,10 @@ #include "include/llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/bit.h" +#include "src/__support/CPP/limits.h" // numeric_limits #include "src/__support/CPP/type_traits.h" -#include "src/__support/macros/attributes.h" // LIBC_INLINE -#include 
"src/__support/macros/config.h" +#include "src/__support/macros/attributes.h" // LIBC_INLINE +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/math_extras.h" @@ -50,6 +51,12 @@ template struct FXBits { static constexpr StorageType SIGN_MASK = (fx_rep::SIGN_LEN == 0 ? 0 : StorageType(1) << SIGN_OFFSET); + // mask for <integral | fraction> + static constexpr StorageType VALUE_MASK = INTEGRAL_MASK | FRACTION_MASK; + + // mask for <sign | integral | fraction> + static constexpr StorageType TOTAL_MASK = SIGN_MASK | VALUE_MASK; + public: LIBC_INLINE constexpr FXBits() = default; @@ -74,6 +81,12 @@ template struct FXBits { return (value & INTEGRAL_MASK) >> INTEGRAL_OFFSET; } + // returns complete bitstring representation of the fixed point number + // the bitstring is of the form: padding | sign | integral | fraction + LIBC_INLINE constexpr StorageType get_bits() { + return (value & TOTAL_MASK) >> FRACTION_OFFSET; + } + // TODO: replace bool with Sign LIBC_INLINE constexpr bool get_sign() { return static_cast((value & SIGN_MASK) >> SIGN_OFFSET); @@ -163,6 +176,26 @@ template LIBC_INLINE constexpr T round(T x, int n) { return bit_and((x + round_bit), rounding_mask); } +// count leading sign bits +template <typename T> +LIBC_INLINE constexpr cpp::enable_if_t<cpp::is_fixed_point_v<T>, int> +countls(T f) { + using FXRep = FXRep<T>; + using BitType = typename FXRep::StorageType; + using FXBits = FXBits<T>; + + constexpr int CONTAIN_LEN = cpp::numeric_limits<BitType>::digits; + constexpr int PADDING_LEN = CONTAIN_LEN - FXRep::TOTAL_LEN; + + if constexpr (FXRep::SIGN_LEN != 0) { + if (f < 0) + f = bit_not(f); + } + + BitType value_bits = FXBits(f).get_bits(); + return cpp::countl_zero(value_bits) - PADDING_LEN; +} + } // namespace fixed_point } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/CMakeLists.txt b/libc/src/stdfix/CMakeLists.txt index 815f739d23efa..3f6f9125a086b 100644 --- a/libc/src/stdfix/CMakeLists.txt +++ b/libc/src/stdfix/CMakeLists.txt @@ -53,6 +53,18 
@@ foreach(suffix IN ITEMS hr r lr hk k lk uhr ur ulr uhk uk ulk) libc.src.__support.CPP.bit libc.src.__support.fixed_point.fx_bits ) + + add_entrypoint_object( + countls${suffix} + HDRS + countls${suffix}.h + SRCS + countls${suffix}.cpp + COMPILE_OPTIONS + ${libc_opt_high_flag} + DEPENDS + libc.src.__support.fixed_point.fx_bits + ) endforeach() add_entrypoint_object( diff --git a/libc/src/stdfix/countlshk.cpp b/libc/src/stdfix/countlshk.cpp new file mode 100644 index 0000000000000..f94728beff1cb --- /dev/null +++ b/libc/src/stdfix/countlshk.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlshk function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlshk.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlshk, (short accum f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlshk.h b/libc/src/stdfix/countlshk.h new file mode 100644 index 0000000000000..ab334244e166a --- /dev/null +++ b/libc/src/stdfix/countlshk.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlshk function ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSHK_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSHK_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlshk(short accum f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSHK_H diff --git a/libc/src/stdfix/countlshr.cpp b/libc/src/stdfix/countlshr.cpp new file mode 100644 index 0000000000000..d77d3e9a3c22a --- /dev/null +++ b/libc/src/stdfix/countlshr.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlshr function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlshr.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlshr, (short fract f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlshr.h b/libc/src/stdfix/countlshr.h new file mode 100644 index 0000000000000..579b7b680406e --- /dev/null +++ b/libc/src/stdfix/countlshr.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlshr function ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSHR_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSHR_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlshr(short fract f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSHR_H diff --git a/libc/src/stdfix/countlsk.cpp b/libc/src/stdfix/countlsk.cpp new file mode 100644 index 0000000000000..b6f56adee16a6 --- /dev/null +++ b/libc/src/stdfix/countlsk.cpp @@ -0,0 +1,18 @@ +//===-- Implementation for countlsk function -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlsk.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlsk, (accum f)) { return fixed_point::countls(f); } + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsk.h b/libc/src/stdfix/countlsk.h new file mode 100644 index 0000000000000..d0c893bc078d5 --- /dev/null +++ b/libc/src/stdfix/countlsk.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlsk function -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSK_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSK_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlsk(accum f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSK_H diff --git a/libc/src/stdfix/countlslk.cpp b/libc/src/stdfix/countlslk.cpp new file mode 100644 index 0000000000000..9bf30ff34c6ee --- /dev/null +++ b/libc/src/stdfix/countlslk.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlslk function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlslk.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlslk, (long accum f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlslk.h b/libc/src/stdfix/countlslk.h new file mode 100644 index 0000000000000..60fa469797b7a --- /dev/null +++ b/libc/src/stdfix/countlslk.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlslk function ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSLK_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSLK_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlslk(long accum f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSLK_H diff --git a/libc/src/stdfix/countlslr.cpp b/libc/src/stdfix/countlslr.cpp new file mode 100644 index 0000000000000..774023c734a37 --- /dev/null +++ b/libc/src/stdfix/countlslr.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlslr function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlslr.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlslr, (long fract f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlslr.h b/libc/src/stdfix/countlslr.h new file mode 100644 index 0000000000000..c909551e77a1a --- /dev/null +++ b/libc/src/stdfix/countlslr.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlslr function ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSLR_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSLR_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlslr(long fract f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSLR_H diff --git a/libc/src/stdfix/countlsr.cpp b/libc/src/stdfix/countlsr.cpp new file mode 100644 index 0000000000000..14563127ad5e9 --- /dev/null +++ b/libc/src/stdfix/countlsr.cpp @@ -0,0 +1,18 @@ +//===-- Implementation for countlsr function -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlsr.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlsr, (fract f)) { return fixed_point::countls(f); } + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsr.h b/libc/src/stdfix/countlsr.h new file mode 100644 index 0000000000000..75dcf4aff0ca3 --- /dev/null +++ b/libc/src/stdfix/countlsr.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlsr function -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSR_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSR_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlsr(fract f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSR_H diff --git a/libc/src/stdfix/countlsuhk.cpp b/libc/src/stdfix/countlsuhk.cpp new file mode 100644 index 0000000000000..2cc266f47da1f --- /dev/null +++ b/libc/src/stdfix/countlsuhk.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlsuhk function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlsuhk.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlsuhk, (unsigned short accum f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsuhk.h b/libc/src/stdfix/countlsuhk.h new file mode 100644 index 0000000000000..fcb2fec3500d4 --- /dev/null +++ b/libc/src/stdfix/countlsuhk.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlsuhk function -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSUHK_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSUHK_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlsuhk(unsigned short accum f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSUHK_H diff --git a/libc/src/stdfix/countlsuhr.cpp b/libc/src/stdfix/countlsuhr.cpp new file mode 100644 index 0000000000000..f30b0dd731aa9 --- /dev/null +++ b/libc/src/stdfix/countlsuhr.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlsuhr function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlsuhr.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlsuhr, (unsigned short fract f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsuhr.h b/libc/src/stdfix/countlsuhr.h new file mode 100644 index 0000000000000..c6ce001d38b11 --- /dev/null +++ b/libc/src/stdfix/countlsuhr.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlsuhr function -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSUHR_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSUHR_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlsuhr(unsigned short fract f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSUHR_H diff --git a/libc/src/stdfix/countlsuk.cpp b/libc/src/stdfix/countlsuk.cpp new file mode 100644 index 0000000000000..3f32ba0815b6e --- /dev/null +++ b/libc/src/stdfix/countlsuk.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlsuk function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlsuk.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlsuk, (unsigned accum f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsuk.h b/libc/src/stdfix/countlsuk.h new file mode 100644 index 0000000000000..7ad0e701b927b --- /dev/null +++ b/libc/src/stdfix/countlsuk.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlsuk function ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSUK_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSUK_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlsuk(unsigned accum f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSUK_H diff --git a/libc/src/stdfix/countlsulk.cpp b/libc/src/stdfix/countlsulk.cpp new file mode 100644 index 0000000000000..04090dd86c732 --- /dev/null +++ b/libc/src/stdfix/countlsulk.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlsulk function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlsulk.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlsulk, (unsigned long accum f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsulk.h b/libc/src/stdfix/countlsulk.h new file mode 100644 index 0000000000000..55ca9d2e20ff0 --- /dev/null +++ b/libc/src/stdfix/countlsulk.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlsulk function -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSULK_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSULK_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlsulk(unsigned long accum f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSULK_H diff --git a/libc/src/stdfix/countlsulr.cpp b/libc/src/stdfix/countlsulr.cpp new file mode 100644 index 0000000000000..d9d6ff404c211 --- /dev/null +++ b/libc/src/stdfix/countlsulr.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlsulr function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlsulr.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlsulr, (unsigned long fract f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsulr.h b/libc/src/stdfix/countlsulr.h new file mode 100644 index 0000000000000..59e7d726d01b9 --- /dev/null +++ b/libc/src/stdfix/countlsulr.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlsulr function -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSULR_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSULR_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlsulr(unsigned long fract f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSULR_H diff --git a/libc/src/stdfix/countlsur.cpp b/libc/src/stdfix/countlsur.cpp new file mode 100644 index 0000000000000..777e5f387aadf --- /dev/null +++ b/libc/src/stdfix/countlsur.cpp @@ -0,0 +1,20 @@ +//===-- Implementation for countlsur function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "countlsur.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, countlsur, (unsigned fract f)) { + return fixed_point::countls(f); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsur.h b/libc/src/stdfix/countlsur.h new file mode 100644 index 0000000000000..1d34e971a52b3 --- /dev/null +++ b/libc/src/stdfix/countlsur.h @@ -0,0 +1,21 @@ +//===-- Implementation header for countlsur function ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSUR_H +#define LLVM_LIBC_SRC_STDFIX_COUNTLSUR_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL + +namespace LIBC_NAMESPACE_DECL { + +int countlsur(unsigned fract f); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSUR_H diff --git a/libc/test/src/stdfix/CMakeLists.txt b/libc/test/src/stdfix/CMakeLists.txt index 90d20438edb4b..c8c4fd96bc2b3 100644 --- a/libc/test/src/stdfix/CMakeLists.txt +++ b/libc/test/src/stdfix/CMakeLists.txt @@ -73,6 +73,22 @@ foreach(suffix IN ITEMS hr r lr hk k lk uhr ur ulr uhk uk ulk) libc.src.__support.CPP.bit libc.src.__support.fixed_point.fx_bits ) + + add_libc_test( + countls${suffix}_test + SUITE + libc-stdfix-tests + HDRS + CountlsTest.h + SRCS + countls${suffix}_test.cpp + COMPILE_OPTIONS + -O3 + DEPENDS + libc.src.stdfix.countls${suffix} + libc.src.__support.fixed_point.fx_rep + libc.src.__support.fixed_point.fx_bits + ) endforeach() add_libc_test( diff --git a/libc/test/src/stdfix/CountlsTest.h b/libc/test/src/stdfix/CountlsTest.h new file mode 100644 index 0000000000000..fe3917754a251 --- /dev/null +++ b/libc/test/src/stdfix/CountlsTest.h @@ -0,0 +1,62 @@ +//===-- Utility class to test countls -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "test/UnitTest/Test.h" + +#include "src/__support/fixed_point/fx_rep.h" + +template <typename T> class CountlsTest : public LIBC_NAMESPACE::testing::Test { + + using FXRep = LIBC_NAMESPACE::fixed_point::FXRep<T>; + static constexpr T zero = FXRep::ZERO(); + static constexpr T max = FXRep::MAX(); + static constexpr T min = FXRep::MIN(); + static constexpr T one_half = FXRep::ONE_HALF(); + static constexpr T one_fourth = FXRep::ONE_FOURTH(); + static constexpr T eps = FXRep::EPS(); + + static constexpr auto value_len = FXRep::INTEGRAL_LEN + FXRep::FRACTION_LEN; + +public: + typedef int (*CountlsFunc)(T); + + void testSpecialNumbers(CountlsFunc func) { + constexpr bool is_signed = (FXRep::SIGN_LEN > 0); + + EXPECT_EQ(FXRep::INTEGRAL_LEN, func(one_half)); + EXPECT_EQ(FXRep::INTEGRAL_LEN + 1, func(one_fourth)); + EXPECT_EQ(value_len, func(zero)); + EXPECT_EQ(value_len - 1, func(eps)); + EXPECT_EQ(0, func(max)); + // If signed, left shifting the minimum value will overflow, so countls = 0. + // If unsigned, the minimum value is zero, so countls is the number of value + // bits according to ISO/IEC TR 18037. + EXPECT_EQ(is_signed ? 
0 : value_len, func(min)); + + if (10 <= static_cast<int>(max)) { + EXPECT_EQ(FXRep::INTEGRAL_LEN - 4, func(10)); + } + + if (static_cast<int>(min) <= -10) { + EXPECT_EQ(FXRep::INTEGRAL_LEN - 4, func(-10)); + } + + if constexpr (is_signed) { + EXPECT_EQ(value_len, func(-eps)); + EXPECT_EQ(FXRep::INTEGRAL_LEN + 1, func(-one_half)); + if (FXRep::FRACTION_LEN >= 2) { + EXPECT_EQ(FXRep::INTEGRAL_LEN + 2, func(-one_fourth)); + } + } + } +}; + +#define LIST_COUNTLS_TESTS(T, func) \ + using LlvmLibcCountlsTest = CountlsTest<T>; \ + TEST_F(LlvmLibcCountlsTest, SpecialNumbers) { testSpecialNumbers(&func); } \ + static_assert(true, "Require semicolon.") diff --git a/libc/test/src/stdfix/countlshk_test.cpp b/libc/test/src/stdfix/countlshk_test.cpp new file mode 100644 index 0000000000000..659f869706b5f --- /dev/null +++ b/libc/test/src/stdfix/countlshk_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlshk -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlshk.h" + +LIST_COUNTLS_TESTS(short accum, LIBC_NAMESPACE::countlshk); diff --git a/libc/test/src/stdfix/countlshr_test.cpp b/libc/test/src/stdfix/countlshr_test.cpp new file mode 100644 index 0000000000000..361d4acab3b11 --- /dev/null +++ b/libc/test/src/stdfix/countlshr_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlshr -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlshr.h" + +LIST_COUNTLS_TESTS(short fract, LIBC_NAMESPACE::countlshr); diff --git a/libc/test/src/stdfix/countlsk_test.cpp b/libc/test/src/stdfix/countlsk_test.cpp new file mode 100644 index 0000000000000..74cb519ec78de --- /dev/null +++ b/libc/test/src/stdfix/countlsk_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlsk --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlsk.h" + +LIST_COUNTLS_TESTS(accum, LIBC_NAMESPACE::countlsk); diff --git a/libc/test/src/stdfix/countlslk_test.cpp b/libc/test/src/stdfix/countlslk_test.cpp new file mode 100644 index 0000000000000..006939db3c87e --- /dev/null +++ b/libc/test/src/stdfix/countlslk_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlslk -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlslk.h" + +LIST_COUNTLS_TESTS(long accum, LIBC_NAMESPACE::countlslk); diff --git a/libc/test/src/stdfix/countlslr_test.cpp b/libc/test/src/stdfix/countlslr_test.cpp new file mode 100644 index 0000000000000..896cf9259c3ea --- /dev/null +++ b/libc/test/src/stdfix/countlslr_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlslr -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlslr.h" + +LIST_COUNTLS_TESTS(long fract, LIBC_NAMESPACE::countlslr); diff --git a/libc/test/src/stdfix/countlsr_test.cpp b/libc/test/src/stdfix/countlsr_test.cpp new file mode 100644 index 0000000000000..d7ae91ccd6a92 --- /dev/null +++ b/libc/test/src/stdfix/countlsr_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlsr --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlsr.h" + +LIST_COUNTLS_TESTS(fract, LIBC_NAMESPACE::countlsr); diff --git a/libc/test/src/stdfix/countlsuhk_test.cpp b/libc/test/src/stdfix/countlsuhk_test.cpp new file mode 100644 index 0000000000000..d8e68d65160e7 --- /dev/null +++ b/libc/test/src/stdfix/countlsuhk_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlsuhk ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlsuhk.h" + +LIST_COUNTLS_TESTS(unsigned short accum, LIBC_NAMESPACE::countlsuhk); diff --git a/libc/test/src/stdfix/countlsuhr_test.cpp b/libc/test/src/stdfix/countlsuhr_test.cpp new file mode 100644 index 0000000000000..7dbc590d4a552 --- /dev/null +++ b/libc/test/src/stdfix/countlsuhr_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlsuhr ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlsuhr.h" + +LIST_COUNTLS_TESTS(unsigned short fract, LIBC_NAMESPACE::countlsuhr); diff --git a/libc/test/src/stdfix/countlsuk_test.cpp b/libc/test/src/stdfix/countlsuk_test.cpp new file mode 100644 index 0000000000000..20f78d8c942b6 --- /dev/null +++ b/libc/test/src/stdfix/countlsuk_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlsuk -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlsuk.h" + +LIST_COUNTLS_TESTS(unsigned accum, LIBC_NAMESPACE::countlsuk); diff --git a/libc/test/src/stdfix/countlsulk_test.cpp b/libc/test/src/stdfix/countlsulk_test.cpp new file mode 100644 index 0000000000000..81ae208055cd9 --- /dev/null +++ b/libc/test/src/stdfix/countlsulk_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlsulk ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlsulk.h" + +LIST_COUNTLS_TESTS(unsigned long accum, LIBC_NAMESPACE::countlsulk); diff --git a/libc/test/src/stdfix/countlsulr_test.cpp b/libc/test/src/stdfix/countlsulr_test.cpp new file mode 100644 index 0000000000000..5b9b047f7fd74 --- /dev/null +++ b/libc/test/src/stdfix/countlsulr_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlsulr ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlsulr.h" + +LIST_COUNTLS_TESTS(unsigned long fract, LIBC_NAMESPACE::countlsulr); diff --git a/libc/test/src/stdfix/countlsur_test.cpp b/libc/test/src/stdfix/countlsur_test.cpp new file mode 100644 index 0000000000000..67e32d7b56217 --- /dev/null +++ b/libc/test/src/stdfix/countlsur_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for countlsur -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CountlsTest.h" + +#include "src/stdfix/countlsur.h" + +LIST_COUNTLS_TESTS(unsigned fract, LIBC_NAMESPACE::countlsur); From 5566bfa51e9baea0fdcd332198408f8cba39c0d0 Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Fri, 7 Feb 2025 10:57:59 -0800 Subject: [PATCH 006/293] [RISCV] Improve RISCVOperand Printing (#126179) We've gradually added more information to the RISCVOperand structure, but the debug output has never caught up, which is quite confusing. This adds printing for many of additional the fields in the structure, where they are relevant. In addition to this, we now have quite a lot of internal registers which share names with each other - e.g. X0_H, X0_W, X0, X0_Pair all have the same name - so also print the enum value to differentiate these. --- llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index d050194142a47..ea6ca3b8f9a2d 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -1116,18 +1116,21 @@ struct RISCVOperand final : public MCParsedAsmOperand { switch (Kind) { case KindTy::Immediate: - OS << *getImm(); + OS << ""; break; case KindTy::FPImmediate: + OS << ""; break; case KindTy::Register: - OS << ""; + OS << "" : ")>"); break; case KindTy::Token: OS << "'" << getToken() << "'"; break; case KindTy::SystemRegister: - OS << "'; + OS << ""; break; case KindTy::VType: OS << " Date: Fri, 7 Feb 2025 11:01:47 -0800 Subject: [PATCH 007/293] [HLSL][NFC] Add test check to make sure the resource handle gets stored in the resource global (#124866) --- clang/test/CodeGenHLSL/resource-bindings.hlsl | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/clang/test/CodeGenHLSL/resource-bindings.hlsl b/clang/test/CodeGenHLSL/resource-bindings.hlsl index bfa7896bd9811..57e8cc29572b1 100644 --- a/clang/test/CodeGenHLSL/resource-bindings.hlsl +++ b/clang/test/CodeGenHLSL/resource-bindings.hlsl @@ -2,14 +2,17 @@ // CHECK: define internal void @_init_resource_U0S0() // CHECK: %U0S0_h = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v4f32_1_0_0t(i32 0, i32 0, i32 1, i32 0, i1 false) +// CHECK: store target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %U0S0_h, ptr @U0S0, align 4 RWBuffer U0S0 : register(u0); // CHECK: define internal void @_init_resource_U5S3() // CHECK: %U5S3_h = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f32_1_0_0t(i32 3, i32 5, i32 1, i32 0, i1 false) +// CHECK: store target("dx.TypedBuffer", float, 1, 0, 0) %U5S3_h, ptr @U5S3, align 4 RWBuffer U5S3 : register(u5, space3); // CHECK: define internal void @_init_resource_T2S2() // CHECK: %T2S2_h = call target("dx.RawBuffer", i32, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i32_0_0t(i32 2, i32 2, i32 1, i32 0, i1 false) +// CHECK: store target("dx.RawBuffer", i32, 0, 0) %T2S2_h, ptr @T2S2, align 4 StructuredBuffer T2S2 : register(t2, space2); struct S { float4 f; @@ -18,6 +21,7 @@ struct S { // CHECK: define internal void @_init_resource_T3S0() // CHECK: %T3S0_h = call target("dx.RawBuffer", %struct.S, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_s_struct.Ss_0_0t(i32 0, i32 3, i32 1, i32 0, i1 false) +// CHECK: store target("dx.RawBuffer", %struct.S, 0, 0) %T3S0_h, ptr @T3S0, align 4 StructuredBuffer T3S0 : register(t3); // CHECK: define void @main() From 73f11ac17d50f0585d03a880e756ff13c321bfd0 Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Fri, 7 Feb 2025 11:04:09 -0800 Subject: [PATCH 008/293] [mlir][tosa] Use explicit namespace for OpTrait. 
(#126286) I'm seeing build errors in a downstream project using torch-mlir that are fixed by this change. See https://github.com/iree-org/iree/pull/19903#discussion_r1946899561 for more context. The build error on MSVC is: ``` C:\home\runner\_work\iree\iree\third_party\llvm-project\mlir\include\mlir/Dialect/Tosa/Utils/ConversionUtils.h(148): error C2872: 'OpTrait': ambiguous symbol C:\home\runner\_work\iree\iree\third_party\llvm-project\mlir\include\mlir/Dialect/Tosa/IR/TosaOps.h(49): note: could be 'mlir::OpTrait' C:\home\runner\_work\iree\iree\third_party\torch-mlir\include\torch-mlir/Dialect/Torch/IR/TorchTraits.h(23): note: or 'mlir::torch::Torch::OpTrait' C:\home\runner\_work\iree\iree\third_party\llvm-project\mlir\include\mlir/Dialect/Tosa/Utils/ConversionUtils.h(148): note: the template instantiation context (the oldest one first) is C:\home\runner\_work\iree\iree\third_party\torch-mlir\lib\Conversion\TorchToTosa\TosaLegalizeCommon.cpp(126): note: see reference to function template instantiation 'TosaOp mlir::tosa::CreateOpAndInfer(mlir::PatternRewriter &,mlir::Location,mlir::Type,mlir::Value &,mlir::Value &,mlir::Value &)' being compiled with [ TosaOp=mlir::tosa::MulOp ] C:\home\runner\_work\iree\iree\third_party\torch-mlir\include\torch-mlir/Conversion/TorchToTosa/TosaLegalizeUtils.h(83): note: see reference to function template instantiation 'TosaOp mlir::tosa::CreateOpAndInfer(mlir::ImplicitLocOpBuilder &,mlir::Type,mlir::Value &,mlir::Value &,mlir::Value &)' being compiled with [ TosaOp=mlir::tosa::MulOp ] C:\home\runner\_work\iree\iree\third_party\torch-mlir\include\torch-mlir/Conversion/TorchToTosa/TosaLegalizeUtils.h(76): note: see reference to function template instantiation 'TosaOp mlir::tosa::CreateOpAndInferShape(mlir::ImplicitLocOpBuilder &,mlir::Type,mlir::Value &,mlir::Value &,mlir::Value &)' being compiled with [ TosaOp=mlir::tosa::MulOp ] ``` I think the torch-mlir code here is causing the issue, but I'm not sure why builds only started 
failing now: https://github.com/llvm/torch-mlir/blob/main/include/torch-mlir/Dialect/Torch/IR/TorchTraits.h. Given that `mlir::OpTrait` already exists, torch-mlir should not be creating an ambiguous symbol `mlir::torch::Torch::OpTrait`. So while a better fix would be to the downstream project, being explicit here doesn't seem that unreasonable to me. --- mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h b/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h index 88c2162928652..4e2f1b9cb19a9 100644 --- a/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h +++ b/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h @@ -145,7 +145,7 @@ TosaOp createOpAndInferShape(ImplicitLocOpBuilder &builder, Type resultTy, template <typename TosaOp, typename... Args> TosaOp CreateOpAndInferShape(ImplicitLocOpBuilder &builder, Type resultTy, Args &&...args) { - if (TosaOp::template hasTrait<OpTrait::SameOperandsAndResultRank>()) { + if (TosaOp::template hasTrait<::mlir::OpTrait::SameOperandsAndResultRank>()) { // op requires same ranks for tensor operands if constexpr (sizeof...(Args) == 2) { auto argX = std::get<0>(std::tie(args...)); From 4df287a171b63514a5028b85272fcc1b89555ee4 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 7 Feb 2025 20:09:10 +0100 Subject: [PATCH 009/293] [bazel] Add missing dependency for 5a0075adbb623c8661862b9af1272b8f430d9e5c --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 5f7aaf7f8f31b..e07891f004850 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -5224,6 +5224,7 @@ cc_library( ":Support", ":TensorDialect", ":TransformUtils", + ":UBDialect", ":VectorDialect", ":VectorEnumsIncGen", ":VectorInterfaces", From 479ffe851bda03d7707b5aae633f231980b71344 Mon 
Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 7 Feb 2025 13:41:40 -0600 Subject: [PATCH 010/293] [OpenMP] Fix Xarch OpenMP test on Windows Summary: We don't support OpenMP offloading on Windows so this produces weird results. --- clang/test/Driver/offload-Xarch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/Driver/offload-Xarch.c b/clang/test/Driver/offload-Xarch.c index 18c68f2acc884..8856dac198465 100644 --- a/clang/test/Driver/offload-Xarch.c +++ b/clang/test/Driver/offload-Xarch.c @@ -35,7 +35,7 @@ // Make sure that `-Xarch_amdgcn` forwards libraries to the device linker. // RUN: %clang -fopenmp=libomp --offload-arch=gfx90a -nogpulib -nogpuinc \ -// RUN: -Xarch_amdgcn -Wl,-lfoo -### %s 2>&1 \ +// RUN: --target=x86_64-unknown-linux-gnu -Xarch_amdgcn -Wl,-lfoo -### %s 2>&1 \ // RUN: | FileCheck -check-prefix=LIBS %s // RUN: %clang -fopenmp=libomp --offload-arch=gfx90a -nogpulib -nogpuinc \ // RUN: -Xoffload-linker-amdgcn-amd-amdhsa -lfoo -### %s 2>&1 \ From 8a03658d575b5cfd65abb5cd4e80d0ee4163fc11 Mon Sep 17 00:00:00 2001 From: Adam Siemieniuk Date: Fri, 7 Feb 2025 20:43:05 +0100 Subject: [PATCH 011/293] [mlir][xegpu] Tensor descriptor type verifier (#124548) Adds XeGPU tensor descriptor type verifier. The type verifier covers general tensor descriptor invariants w.r.t. Xe ISA semantics. Related operation verifiers are updated to account for the new descriptor checks and avoid duplication. 
--- .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 2 +- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 82 ++++++++- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 47 ++--- mlir/test/Dialect/XeGPU/XeGPUOps.mlir | 22 +++ mlir/test/Dialect/XeGPU/invalid.mlir | 169 +++++++++++++++++- 5 files changed, 278 insertions(+), 44 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td index d09c5c1870d50..494f11f041b71 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td @@ -179,7 +179,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", }]; let hasCustomAssemblyFormat = true; - + let genVerifyDecl = 1; } diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index eb01b15de75c6..becc32d122697 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -175,9 +175,10 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) { if (parser.parseGreater()) return {}; - return TensorDescType::get(parser.getContext(), shape, elementType, - encoding.value_or(mlir::Attribute()), - sg_map.value_or(mlir::Attribute())); + return TensorDescType::getChecked( + [&]() { return parser.emitError(parser.getNameLoc()); }, + parser.getContext(), shape, elementType, + encoding.value_or(mlir::Attribute()), sg_map.value_or(mlir::Attribute())); } void TensorDescType::print(::mlir::AsmPrinter &printer) const { @@ -223,6 +224,81 @@ TensorDescType TensorDescType::get(llvm::ArrayRef shape, return Base::get(context, shape, elementType, attr, sg_map); } +LogicalResult TensorDescType::verify( + llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, + llvm::ArrayRef shape, mlir::Type elementType, + mlir::Attribute encoding, mlir::Attribute sg_map) { + size_t rank = shape.size(); + if (rank != 1 && rank != 2) + return emitError() << "expected 1D or 2D tensor"; + + auto 
scatterAttr = mlir::dyn_cast_if_present(encoding); + if (scatterAttr) { + // Expected tensor ranks for scattered data: + // - 1D tensor for fully non-contiguous elements (chunk size == 1) + // - 2D tensor for scattered blocks (chunk size > 1) + IntegerAttr chunkAttr = scatterAttr.getChunkSize(); + unsigned chunkSize = chunkAttr ? chunkAttr.getInt() : 1; + if (rank == 1 && chunkSize != 1) + return emitError() << "expected non-contiguous elements for 1D tensor"; + if (rank == 2 && chunkSize < 2) + return emitError() << "expected chunk blocks for 2D tensor"; + } + + if (auto blockAttr = + mlir::dyn_cast_if_present(encoding)) { + MemorySpaceAttr memorySpaceAttr = blockAttr.getMemorySpace(); + if (rank == 2 && memorySpaceAttr && + memorySpaceAttr.getValue() == MemorySpace::SLM) + return emitError() << "SLM is not supported for 2D block tensor"; + } + + if (auto sgMapAttr = llvm::dyn_cast_if_present(sg_map)) { + ArrayRef wiLayout = sgMapAttr.getWiLayout(); + ArrayRef wiData = sgMapAttr.getWiData(); + + if (rank == 1) { + if (wiLayout[0] != 1 || wiData[0] != 1) + return emitError() + << "outer layout distribution and data mapping must be 1 " + "for 1D tensor"; + } + + if (scatterAttr) { + // Validate subgroup mapping rules for scattered tensors. + // A work-item's slice of the tensor with shape [sg_size] or + // [sg_size, chunk_size] will be [1] or [1, chunks_size] respectively, + // the mapping should reflect that. + if (wiData[0] != 1) + return emitError() + << "cannot map over non-contiguous scattered row elements"; + + IntegerAttr chunkAttr = scatterAttr.getChunkSize(); + unsigned chunkSize = chunkAttr ? chunkAttr.getInt() : 1; + if (wiData[1] != chunkSize) + return emitError() << "work item data mapping must match the number of " + "contiguous elements"; + } + + // For 1D tensor, pad the shape with an outer unit dimension to allow common + // validation logic. 
+ SmallVector tensorShape(shape.begin(), shape.end()); + if (rank == 1) + tensorShape = {1, tensorShape.back()}; + + size_t dims = tensorShape.size(); + for (size_t i = 0; i < dims; ++i) { + uint32_t numElemPerWi = wiLayout[i] * wiData[i]; + if (tensorShape[i] < numElemPerWi || tensorShape[i] % numElemPerWi != 0) + return emitError() << "cannot distribute " << tensorShape[i] << " over " + << wiLayout[i] << " work items with " << wiData[i] + << " elements each"; + } + } + + return success(); +} + } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index cd883baa986b8..e06d99ac20bb7 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -81,24 +81,28 @@ static bool isWriteHintOrNone(const CachePolicyAttr &attr) { // each dimension. static bool isArgShapesValid(ArrayRef descShape, ArrayRef valShape, SGMapAttr sgMap) { - if (descShape == valShape) { - if (!sgMap) - return true; - - // this can be relaxed if necessary by supporting non-2d shapes distribution - // until the constraints are defined this lives here instead of the tensor - // descriptor type. - return valShape.size() == sgMap.getWiLayout().size(); - } + // Equal shapes with no distribution - no further verification needed. + if (descShape == valShape && !sgMap) + return true; + // Unknown distribution - cannot perform operation on partial shape. if (!sgMap) return false; - if (valShape.size() != descShape.size()) + // Invalid rank or mixed rank usage. + size_t descRank = descShape.size(); + if (descRank > 2 || valShape.size() != descRank) return false; + // For 1D, SG map is guaranteed to be unit size in the outer dimension. + // Only take the distribution over the innermost dimension for validation. 
+ ArrayRef wiLayout = sgMap.getWiLayout(); + SmallVector mapLayout(wiLayout.begin(), wiLayout.end()); + if (descRank == 1) + mapLayout = {wiLayout.back()}; + for (const auto &[factor, dim, expected] : - llvm::zip_equal(sgMap.getWiLayout(), valShape, descShape)) { + llvm::zip_equal(mapLayout, valShape, descShape)) { if (factor * dim != expected) return false; } @@ -227,10 +231,6 @@ LogicalResult CreateNdDescOp::verify() { if (getType().isScattered()) return emitOpError("Expects a non-scattered TensorDesc.\n"); - if (getType().getRank() == 2 && - tdescMemorySpace == static_cast(MemorySpace::SLM)) - return emitOpError("SLM is not supported for 2D Block TensorDesc.\n"); - return success(); } @@ -454,22 +454,7 @@ LogicalResult CreateDescOp::verify() { if (shape != tdescShape) return emitOpError("Incorrect TensorDesc shape. ") << "Expected is " << makeString(shape) << "\n"; - if (auto sgMap = tdescTy.getSGMapAttr()) { - // A work-item's slice of the TensorDesc with shape [sg_size] or - // [sg_size, chunk_size] will be [1] or [1, chunks_size] respectively, - // the mapping should reflect that. - if (sgMap.getWiData()[0] > 1) - return emitOpError("TensorDesc's SG map only supports multiple elements " - "contiguous along rows."); - if (chunkSize != static_cast(sgMap.getWiData()[1])) - return emitOpError( - "TensorDesc's chunkSize must match WI's data mapping."); - if (int rank = tdescTy.getRank(); - (sgMap.getWiLayout()[2 - rank] != tdescShape[0])) - return emitOpError("Detected a conflict between SG map's work-item " - "layout and TensorDesc shape. 
Check the index of " - "`subgroup_size` in WI layout map."); - } + return success(); } diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir index dcd6b01974cf3..8af1b600ad0a4 100644 --- a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir +++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir @@ -97,6 +97,16 @@ gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) { gpu.return } +// CHECK: func @test_load_nd_vc_4(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @test_load_nd_vc_4(%src: memref<24x32xf32>) { + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.sg_map> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + !xegpu.tensor_desc<32xf32, #xegpu.sg_map> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32xf32, #xegpu.sg_map> -> vector<2xf32> + %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32xf32, #xegpu.sg_map> -> vector<2xf32> + gpu.return +} + // CHECK: func @test_store_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) { // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16> @@ -132,6 +142,18 @@ gpu.func @test_store_nd_vc_3(%src: memref<24x32xf16>) { gpu.return } +// CHECK: func @test_store_nd_vc_4(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @test_store_nd_vc_4(%src: memref<24x32xf16>) { + // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2xf16> + %1 = arith.constant dense<1.0>: vector<2xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16, #xegpu.sg_map> + %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> + !xegpu.tensor_desc<32xf16, #xegpu.sg_map> + // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<2xf16>, 
!xegpu.tensor_desc<32xf16, #xegpu.sg_map> + xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<2xf16>, !xegpu.tensor_desc<32xf16, #xegpu.sg_map> + gpu.return +} + // CHECK: gpu.func @test_create_update_nd_tdesc_vc(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @test_create_update_nd_tdesc_vc(%src: memref<24x32xf32>) { // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index 201f72120cf2c..9162e0012f6d5 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -17,7 +17,7 @@ func.func @test_create_nd_tdesc_vc_2(%src: memref<24x32xf32>) { // ----- func.func @test_create_nd_tdesc_vc_3(%src: memref<2x24x32xf32, 3>) { - // expected-error@+1 {{SLM is not supported for 2D Block TensorDesc}} + // expected-error@+1 {{SLM is not supported for 2D block tensor}} %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> return } @@ -82,16 +82,33 @@ func.func @test_load_nd_vc_4(%src: memref<24x32xf32>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> // expected-error@+1 {{Result shape doesn't match TensorDesc shape.}} - %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> -> vector<8x2xf32> + %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint}> + : !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> + -> vector<8x2xf32> return } // ----- func.func @test_load_nd_vc_5(%src: memref<24x32xf32>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> - !xegpu.tensor_desc<16xf32, #xegpu.sg_map> + !xegpu.tensor_desc<16xf32, #xegpu.sg_map> // expected-error@+1 {{Result shape doesn't match TensorDesc shape.}} - %2 = xegpu.load_nd %1: 
!xegpu.tensor_desc<16xf32, #xegpu.sg_map> -> vector<16xf32> + %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint}> + : !xegpu.tensor_desc<16xf32, #xegpu.sg_map> + -> vector<8xf32> + return +} + +// ----- +func.func @test_load_nd_vc_6(%src: memref<24x32xf32>) { + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + !xegpu.tensor_desc<8x16xf32> + // expected-error@+1 {{Result shape doesn't match TensorDesc shape.}} + %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint, + l2_hint = #xegpu.cache_hint}> + : !xegpu.tensor_desc<8x16xf32> -> vector<8x1xf32> return } @@ -116,6 +133,35 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) { return } +// ----- +func.func @test_store_nd_vc_3(%dst: memref<24x32xf32>, %data: vector<8x2xf32>) { + %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> + !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> + // expected-error@+1 {{Result shape doesn't match TensorDesc shape.}} + xegpu.store_nd %data, %1 + : vector<8x2xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map> + return +} + +// ----- +func.func @test_store_nd_vc_4(%dst: memref<24x32xf32>, %data: vector<2xf32>) { + %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> + !xegpu.tensor_desc<16xf32, #xegpu.sg_map> + // expected-error@+1 {{Result shape doesn't match TensorDesc shape.}} + xegpu.store_nd %data, %1 + : vector<2xf32>, !xegpu.tensor_desc<16xf32, #xegpu.sg_map> + return +} + +// ----- +func.func @test_store_nd_vc_5(%dst: memref<24x32xf32>, %data: vector<8x1xf32>) { + %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> + !xegpu.tensor_desc<8x16xf32> + // expected-error@+1 {{Result shape doesn't match TensorDesc shape.}} + xegpu.store_nd %data, %1 : vector<8x1xf32>, !xegpu.tensor_desc<8x16xf32> + return +} + // ----- func.func @test_update_nd_offset_1(%dst: memref<16xf16>) { %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex> @@ -137,8 +183,8 @@ func.func @test_create_tdesc_vc_1(%src: ui64) { // 
----- func.func @test_create_tdesc_vc_2(%src: ui64) { %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex> - // expected-error@+1 {{Incorrect TensorDesc shape}} %1 = xegpu.create_tdesc %src, %0 : ui64, vector<8xindex> + // expected-error@+1 {{expected chunk blocks for 2D tensor}} -> !xegpu.tensor_desc<8x4xf16, #xegpu.scatter_tdesc_attr<>> return } @@ -173,7 +219,7 @@ func.func @test_prefetch_vc_2(%src: ui64) { // ----- func.func @test_create_tdesc_sg_map_1(%src: ui64) { %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - // expected-error@+1 {{Detected a conflict between SG map's work-item layout and TensorDesc shape. Check the index of `subgroup_size` in WI layout map}} + // expected-error@+1 {{outer layout distribution and data mapping must be 1 for 1D tensor}} %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.sg_map> return } @@ -181,7 +227,7 @@ func.func @test_create_tdesc_sg_map_1(%src: ui64) { // ----- func.func @test_create_tdesc_sg_map_2(%src: ui64) { %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - // expected-error@+1 {{TensorDesc's SG map only supports multiple elements contiguous along rows}} + // expected-error@+1 {{cannot map over non-contiguous scattered row elements}} %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr, #xegpu.sg_map> return } @@ -189,7 +235,7 @@ func.func @test_create_tdesc_sg_map_2(%src: ui64) { // ----- func.func @test_create_tdesc_sg_map_3(%src: ui64) { %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> - // expected-error@+1 {{TensorDesc's chunkSize must match WI's data mapping}} + // expected-error@+1 {{work item data mapping must match the number of contiguous elements}} %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr, #xegpu.sg_map> return } @@ -315,4 +361,109 @@ 
func.func @test_atomic_rmw(%src: ui64, %value : vector<16x4xf32>, %mask : vector // expected-error@+1 {{failed to verify that all of {tensorDesc, value, result} have same shape}} xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1>, vector<16x4xf32> -> vector<16x8xf32> return -} \ No newline at end of file +} + +// ----- +func.func @tensor_desc_invalid_rank(%src: memref<24x32xf32>) { + %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + // expected-error@+1 {{expected 1D or 2D tensor}} + !xegpu.tensor_desc<16x2x2xf32> + return +} + +// ----- +func.func @tensor_desc_invalid_rank_1(%src: memref<24x32xf32>) { + %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + // expected-error@+1 {{expected 1D or 2D tensor}} + !xegpu.tensor_desc + return +} + +// ----- +func.func @tensor_desc_1D_invalid_map_layout(%src: memref<24x32xf32>) { + %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + // expected-error@+1 {{outer layout distribution and data mapping must be 1 for 1D tensor}} + !xegpu.tensor_desc<16xf32, #xegpu.sg_map> + return +} + +// ----- +func.func @tensor_desc_1D_invalid_map_data(%src: memref<24x32xf32>) { + %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + // expected-error@+1 {{outer layout distribution and data mapping must be 1 for 1D tensor}} + !xegpu.tensor_desc<16xf32, #xegpu.sg_map> + return +} + +// ----- +func.func @tensor_desc_invalid_map_layout(%src: memref<24x32xf32>) { + %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + // expected-error@+1 {{cannot distribute 8 over 16 work items with 1 elements each}} + !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map> + return +} + +// ----- +func.func @tensor_desc_invalid_map_layout_1(%src: memref<24x32xf32>) { + %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + // expected-error@+1 {{cannot distribute 4 over 8 work items with 1 elements each}} + !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map> + 
return +} + +// ----- +func.func @tensor_desc_invalid_map_data(%src: memref<24x32xf32>) { + %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + // expected-error@+1 {{cannot distribute 4 over 2 work items with 4 elements each}} + !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map> + return +} + +// ----- +func.func @tensor_desc_invalid_map_data_1(%src: memref<24x32xf32>) { + %0 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> + // expected-error@+1 {{cannot distribute 4 over 8 work items with 1 elements each}} + !xegpu.tensor_desc<4x8xf32, #xegpu.sg_map> + return +} + +// ----- +func.func @tensor_desc_scatter_invalid_map_data(%src: ui64) { + %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex> + %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> + // expected-error@+1 {{cannot map over non-contiguous scattered row elements}} + !xegpu.tensor_desc<4x2xf32, + #xegpu.scatter_tdesc_attr, + #xegpu.sg_map> + return +} + +// ----- +func.func @tensor_desc_scatter_invalid_map_data_1(%src: ui64, %offsets: vector<16xindex>) { + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> + // expected-error@+1 {{work item data mapping must match the number of contiguous elements}} + !xegpu.tensor_desc<16xf32, + #xegpu.scatter_tdesc_attr, + #xegpu.sg_map> + return +} + +// ----- +func.func @tensor_desc_scatter_invalid_chunk_size_1D(%src: ui64, %offsets: vector<16xindex>) { + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> + // expected-error@+1 {{expected non-contiguous elements for 1D tensor}} + !xegpu.tensor_desc<16xf32, + #xegpu.scatter_tdesc_attr, + #xegpu.sg_map> + return +} + +// ----- +func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vector<16xindex>) { + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> + // expected-error@+1 {{expected chunk blocks for 2D tensor}} + !xegpu.tensor_desc<16x2xf32, + #xegpu.scatter_tdesc_attr, + #xegpu.sg_map> + return +} From 
605a9e37b64b5b5647f1f3e560f7ffe31157841c Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 7 Feb 2025 13:44:20 -0600 Subject: [PATCH 012/293] [OpenMP] Fix infinite loop on recursive initializers (#126269) Summary: If the user tried to initialize a gobal declare target variable with itself the compiler will hang forever. Add a visited set to make sure this stops. Fixes: https://github.com/llvm/llvm-project/issues/69194 --- clang/lib/Sema/SemaOpenMP.cpp | 4 ++++ clang/test/OpenMP/declare_target_messages.cpp | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index b060039d188a1..376995d624e28 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -22819,8 +22819,12 @@ class GlobalDeclRefChecker final : public StmtVisitor { void declareTargetInitializer(Decl *TD) { A = TD->getAttr(); DeclVector.push_back(cast(TD)); + llvm::SmallDenseSet Visited; while (!DeclVector.empty()) { VarDecl *TargetVarDecl = DeclVector.pop_back_val(); + if (!Visited.insert(TargetVarDecl).second) + continue; + if (TargetVarDecl->hasAttr() && TargetVarDecl->hasInit() && TargetVarDecl->hasGlobalStorage()) { if (Expr *Ex = TargetVarDecl->getInit()) diff --git a/clang/test/OpenMP/declare_target_messages.cpp b/clang/test/OpenMP/declare_target_messages.cpp index ce5a833b3866a..3c0e766cf72ca 100644 --- a/clang/test/OpenMP/declare_target_messages.cpp +++ b/clang/test/OpenMP/declare_target_messages.cpp @@ -33,6 +33,11 @@ // RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp51,ompvar,omp45-to-51,omp5-and-51,omp5-or-later,omp5-or-later-var,omp45-to-51-var,omp45-to-51-clause,host-5-and-51,no-host5-and-51 -fopenmp %{limit} -o - %s // RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp52,ompvar,omp5-or-later,omp5-or-later-var %{openmp60} %{limit} -o - %s +#pragma omp begin declare target +static int gg; +// expected-warning@+1 {{variable 'recursive' is uninitialized when used within its own 
initialization}} +int recursive = recursive ^ 3 + gg; +#pragma omp end declare target // expected-error@+1 {{unexpected OpenMP directive '#pragma omp end declare target'}} #pragma omp end declare target From d705e7e9eb5e6d3c791935f191225118b88ab574 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 7 Feb 2025 11:47:49 -0800 Subject: [PATCH 013/293] [NFC][TableGen] Code cleanup in CodeGenMapTable `EmitMapTable` (#126157) - Emit C++17 nested namespaces. - Shorten the binary search table name to just `Table` since its declared in the scope of each search function. - Use `using namespace XXX` in the search function to avoid emitting the Target Inst Namespace prefix in the table entries. - Add short-cut handling of `TableSize` == 0 case (verified in Hexagon target). - Use `SetVector` in `ColFieldValueMap` to get automatic deduplication and eliminate manual deduplication code. - Use range for loops. --- llvm/utils/TableGen/CodeGenMapTable.cpp | 114 ++++++++++-------------- 1 file changed, 49 insertions(+), 65 deletions(-) diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp index 8d22c0013dda8..2641e713c0c85 100644 --- a/llvm/utils/TableGen/CodeGenMapTable.cpp +++ b/llvm/utils/TableGen/CodeGenMapTable.cpp @@ -78,6 +78,7 @@ #include "Common/CodeGenInstruction.h" #include "Common/CodeGenTarget.h" #include "TableGenBackends.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" @@ -361,45 +362,38 @@ unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) { StringRef Namespace = Target.getInstNamespace(); ArrayRef ValueCols = InstrMapDesc.getValueCols(); unsigned NumCol = ValueCols.size(); - unsigned TotalNumInstr = NumberedInstructions.size(); unsigned TableSize = 0; - OS << "static const uint16_t " << InstrMapDesc.getName(); + OS << " using namespace " << Namespace << ";\n"; // Number of columns in the table are NumCol+1 because key 
instructions are // emitted as first column. - OS << "Table[][" << NumCol + 1 << "] = {\n"; - for (unsigned I = 0; I < TotalNumInstr; I++) { - const Record *CurInstr = NumberedInstructions[I]->TheDef; + for (const CodeGenInstruction *Inst : NumberedInstructions) { + const Record *CurInstr = Inst->TheDef; ArrayRef ColInstrs = MapTable[CurInstr]; + if (ColInstrs.empty()) + continue; std::string OutStr; - unsigned RelExists = 0; - if (!ColInstrs.empty()) { - for (unsigned J = 0; J < NumCol; J++) { - if (ColInstrs[J] != nullptr) { - RelExists = 1; - OutStr += ", "; - OutStr += Namespace; - OutStr += "::"; - OutStr += ColInstrs[J]->getName(); - } else { - OutStr += ", (uint16_t)-1U"; - } + bool RelExists = false; + for (const Record *ColInstr : ColInstrs) { + if (ColInstr) { + RelExists = true; + OutStr += ", "; + OutStr += ColInstr->getName(); + } else { + OutStr += ", (uint16_t)-1U"; } + } - if (RelExists) { - OS << " { " << Namespace << "::" << CurInstr->getName(); - OS << OutStr << " },\n"; - TableSize++; - } + if (RelExists) { + if (TableSize == 0) + OS << " static constexpr uint16_t Table[][" << NumCol + 1 << "] = {\n"; + OS << " { " << CurInstr->getName() << OutStr << " },\n"; + ++TableSize; } } - if (!TableSize) { - OS << " { " << Namespace << "::" - << "INSTRUCTION_LIST_END, "; - OS << Namespace << "::" - << "INSTRUCTION_LIST_END }"; - } - OS << "}; // End of " << InstrMapDesc.getName() << "Table\n\n"; + + if (TableSize != 0) + OS << " }; // End of Table\n\n"; return TableSize; } @@ -409,15 +403,19 @@ unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) { //===----------------------------------------------------------------------===// void MapTableEmitter::emitBinSearch(raw_ostream &OS, unsigned TableSize) { + if (TableSize == 0) { + OS << " return -1;\n"; + return; + } + OS << " unsigned mid;\n"; OS << " unsigned start = 0;\n"; OS << " unsigned end = " << TableSize << ";\n"; OS << " while (start < end) {\n"; OS << " mid = start + (end - start) / 
2;\n"; - OS << " if (Opcode == " << InstrMapDesc.getName() << "Table[mid][0]) {\n"; + OS << " if (Opcode == Table[mid][0]) \n"; OS << " break;\n"; - OS << " }\n"; - OS << " if (Opcode < " << InstrMapDesc.getName() << "Table[mid][0])\n"; + OS << " if (Opcode < Table[mid][0])\n"; OS << " end = mid;\n"; OS << " else\n"; OS << " start = mid + 1;\n"; @@ -431,7 +429,6 @@ void MapTableEmitter::emitBinSearch(raw_ostream &OS, unsigned TableSize) { //===----------------------------------------------------------------------===// void MapTableEmitter::emitMapFuncBody(raw_ostream &OS, unsigned TableSize) { - const ListInit *ColFields = InstrMapDesc.getColFields(); ArrayRef ValueCols = InstrMapDesc.getValueCols(); @@ -439,6 +436,8 @@ void MapTableEmitter::emitMapFuncBody(raw_ostream &OS, unsigned TableSize) { // relation table. If found, return opcode value from the appropriate column // of the table. emitBinSearch(OS, TableSize); + if (TableSize == 0) + return; if (ValueCols.size() > 1) { for (unsigned I = 0, E = ValueCols.size(); I < E; I++) { @@ -453,14 +452,12 @@ void MapTableEmitter::emitMapFuncBody(raw_ostream &OS, unsigned TableSize) { OS << " && "; } OS << ")\n"; - OS << " return " << InstrMapDesc.getName(); - OS << "Table[mid][" << I + 1 << "];\n"; + OS << " return Table[mid][" << I + 1 << "];\n"; } OS << " return -1;"; - } else - OS << " return " << InstrMapDesc.getName() << "Table[mid][1];\n"; - - OS << "}\n\n"; + } else { + OS << " return Table[mid][1];\n"; + } } //===----------------------------------------------------------------------===// @@ -468,7 +465,6 @@ void MapTableEmitter::emitMapFuncBody(raw_ostream &OS, unsigned TableSize) { //===----------------------------------------------------------------------===// void MapTableEmitter::emitTablesWithFunc(raw_ostream &OS) { - // Emit function name and the input parameters : mostly opcode value of the // current instruction. 
However, if a table has multiple columns (more than 2 // since first column is used for the key instructions), then we also need @@ -491,6 +487,8 @@ void MapTableEmitter::emitTablesWithFunc(raw_ostream &OS) { // Emit rest of the function body. emitMapFuncBody(OS, TableSize); + + OS << "}\n\n"; } //===----------------------------------------------------------------------===// @@ -498,7 +496,7 @@ void MapTableEmitter::emitTablesWithFunc(raw_ostream &OS) { //===----------------------------------------------------------------------===// static void emitEnums(raw_ostream &OS, const RecordKeeper &Records) { - std::map> ColFieldValueMap; + std::map> ColFieldValueMap; // Iterate over all InstrMapping records and create a map between column // fields and their possible values across all records. @@ -507,10 +505,9 @@ static void emitEnums(raw_ostream &OS, const RecordKeeper &Records) { const ListInit *ColFields = CurMap->getValueAsListInit("ColFields"); const ListInit *List = CurMap->getValueAsListInit("ValueCols"); std::vector ValueCols; - unsigned ListSize = List->size(); - for (unsigned J = 0; J < ListSize; J++) { - const auto *ListJ = cast(List->getElement(J)); + for (const Init *Elem : *List) { + const auto *ListJ = cast(Elem); if (ListJ->size() != ColFields->size()) PrintFatalError("Record `" + CurMap->getName() + @@ -521,37 +518,26 @@ static void emitEnums(raw_ostream &OS, const RecordKeeper &Records) { } for (unsigned J = 0, EndCf = ColFields->size(); J < EndCf; J++) { - for (unsigned K = 0; K < ListSize; K++) { - std::string ColName = ColFields->getElement(J)->getAsUnquotedString(); - ColFieldValueMap[ColName].push_back((ValueCols[K])->getElement(J)); - } + std::string ColName = ColFields->getElement(J)->getAsUnquotedString(); + auto &MapEntry = ColFieldValueMap[ColName]; + for (const ListInit *List : ValueCols) + MapEntry.insert(List->getElement(J)); } } for (auto &[EnumName, FieldValues] : ColFieldValueMap) { - // Delete duplicate entries from ColFieldValueMap - 
for (unsigned i = 0; i < FieldValues.size() - 1; i++) { - const Init *CurVal = FieldValues[i]; - for (unsigned j = i + 1; j < FieldValues.size(); j++) { - if (CurVal == FieldValues[j]) { - FieldValues.erase(FieldValues.begin() + j); - --j; - } - } - } - // Emit enumerated values for the column fields. OS << "enum " << EnumName << " {\n"; ListSeparator LS(",\n"); for (const Init *Field : FieldValues) - OS << LS << "\t" << EnumName << "_" << Field->getAsUnquotedString(); + OS << LS << " " << EnumName << "_" << Field->getAsUnquotedString(); OS << "\n};\n\n"; } } //===----------------------------------------------------------------------===// // Parse 'InstrMapping' records and use the information to form relationship -// between instructions. These relations are emitted as a tables along with the +// between instructions. These relations are emitted as tables along with the // functions to query them. //===----------------------------------------------------------------------===// void llvm::EmitMapTable(const RecordKeeper &Records, raw_ostream &OS) { @@ -565,8 +551,7 @@ void llvm::EmitMapTable(const RecordKeeper &Records, raw_ostream &OS) { OS << "#ifdef GET_INSTRMAP_INFO\n"; OS << "#undef GET_INSTRMAP_INFO\n"; - OS << "namespace llvm {\n\n"; - OS << "namespace " << NameSpace << " {\n\n"; + OS << "namespace llvm::" << NameSpace << " {\n\n"; // Emit coulumn field names and their values as enums. emitEnums(OS, Records); @@ -589,7 +574,6 @@ void llvm::EmitMapTable(const RecordKeeper &Records, raw_ostream &OS) { // Emit map tables and the functions to query them. 
IMap.emitTablesWithFunc(OS); } - OS << "} // end namespace " << NameSpace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << NameSpace << '\n'; OS << "#endif // GET_INSTRMAP_INFO\n\n"; } From 7abca756d1434f0e51cd0af8da647d51f38f634e Mon Sep 17 00:00:00 2001 From: Ben Barham Date: Fri, 7 Feb 2025 12:02:13 -0800 Subject: [PATCH 014/293] [Index] Skip adding call relations to deduction guides (#126151) Deduction guides have no name and we already skip adding occurrences to them for that reason. Also skip adding any relations to them. --- clang/lib/Index/IndexBody.cpp | 3 +++ clang/test/Index/index-deduction-guide.cpp | 10 ++++++++++ 2 files changed, 13 insertions(+) create mode 100644 clang/test/Index/index-deduction-guide.cpp diff --git a/clang/lib/Index/IndexBody.cpp b/clang/lib/Index/IndexBody.cpp index c18daf7faa749..f1dc4d5831ce7 100644 --- a/clang/lib/Index/IndexBody.cpp +++ b/clang/lib/Index/IndexBody.cpp @@ -130,6 +130,9 @@ class BodyIndexer : public RecursiveASTVisitor { void addCallRole(SymbolRoleSet &Roles, SmallVectorImpl &Relations) { + if (isa(ParentDC)) + return; + Roles |= (unsigned)SymbolRole::Call; if (auto *FD = dyn_cast(ParentDC)) Relations.emplace_back((unsigned)SymbolRole::RelationCalledBy, FD); diff --git a/clang/test/Index/index-deduction-guide.cpp b/clang/test/Index/index-deduction-guide.cpp new file mode 100644 index 0000000000000..a29162e8588e8 --- /dev/null +++ b/clang/test/Index/index-deduction-guide.cpp @@ -0,0 +1,10 @@ +// RUN: c-index-test core -print-source-symbols -- %s -std=gnu++17 | FileCheck %s + +template +typename T::type declval() {} +template struct Test; +template ().d())> Test(C &) -> Test; +// CHECK: [[@LINE-1]]:45 | function/C | declval +// CHECK-NOT: RelCall +// CHECK: [[@LINE-3]]:77 | struct(Gen)/C++ | Test +// CHECK: [[@LINE-4]]:64 | struct(Gen)/C++ | Test From bada9220b87e73c0f4a498b82f883e17eda928d1 Mon Sep 17 00:00:00 2001 From: Krishna Pandey 
<47917477+krishna2803@users.noreply.github.com> Date: Sat, 8 Feb 2025 01:40:18 +0530 Subject: [PATCH 015/293] [libc][stdfix] Fix buildbot failure because of a typo. (#126291) Fix build-bot failure caused by #125356 --- libc/src/__support/fixed_point/fx_bits.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/libc/src/__support/fixed_point/fx_bits.h b/libc/src/__support/fixed_point/fx_bits.h index 21985e6442534..4b87a9018d670 100644 --- a/libc/src/__support/fixed_point/fx_bits.h +++ b/libc/src/__support/fixed_point/fx_bits.h @@ -187,12 +187,11 @@ countls(T f) { constexpr int CONTAIN_LEN = cpp::numeric_limits::digits; constexpr int PADDING_LEN = CONTAIN_LEN - FXRep::TOTAL_LEN; - if constexpr (FXRep::SIGN_LEN != 0) { - if (x < 0) - x = bit_not(x); - } + if constexpr (FXRep::SIGN_LEN != 0) + if (f < 0) + f = bit_not(f); - BitType value_bits = FXBits(x)::get_bits(); + BitType value_bits = FXBits(f)::get_bits(); return cpp::countl_zero(value_bits) - PADDING_LEN; } From 6ef978b8c41a83378af3de1dceeea434715f80f4 Mon Sep 17 00:00:00 2001 From: QuietMisdreavus Date: Fri, 7 Feb 2025 13:23:10 -0700 Subject: [PATCH 016/293] [clang][ExtractAPI] combine typedef records if the underlying type's name is underscored (#125964) fixes rdar://137214218 When 'typedef struct' decls are encountered, the records are combined if the underlying type is either anonymous or has the same name as the typedef. Extend this behavior to also combine records when the underlying type has an underscored name that is equivalent to the typedef name when the leading underscores are removed. 
--- .../clang/ExtractAPI/ExtractAPIVisitor.h | 25 ++++++- clang/test/ExtractAPI/typedef_underscore.c | 69 +++++++++++++++++++ 2 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 clang/test/ExtractAPI/typedef_underscore.c diff --git a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h index aa86e41806711..e60440e14a9fe 100644 --- a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h +++ b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h @@ -1146,11 +1146,29 @@ bool ExtractAPIVisitorBase::VisitTypedefNameDecl( StringRef Name = Decl->getName(); + auto nameMatches = [&Name](TagDecl *TagDecl) { + StringRef TagName = TagDecl->getName(); + + if (TagName == Name) + return true; + + // Also check whether the tag decl's name is the same as the typedef name + // with prefixed underscores + if (TagName.starts_with('_')) { + StringRef StrippedName = TagName.ltrim('_'); + + if (StrippedName == Name) + return true; + } + + return false; + }; + // If the underlying type was defined as part of the typedef modify it's // fragments directly and pretend the typedef doesn't exist. 
if (auto *TagDecl = Decl->getUnderlyingType()->getAsTagDecl()) { if (TagDecl->isEmbeddedInDeclarator() && TagDecl->isCompleteDefinition() && - Decl->getName() == TagDecl->getName()) { + nameMatches(TagDecl)) { SmallString<128> TagUSR; index::generateUSRForDecl(TagDecl, TagUSR); if (auto *Record = API.findRecordForUSR(TagUSR)) { @@ -1164,6 +1182,11 @@ bool ExtractAPIVisitorBase::VisitTypedefNameDecl( .append(Name, DeclarationFragments::FragmentKind::Identifier) .appendSemicolon(); + // Replace the name and subheading in case it's underscored so we can + // use the non-underscored version + Record->Name = Name; + Record->SubHeading = DeclarationFragmentsBuilder::getSubHeading(Decl); + return true; } } diff --git a/clang/test/ExtractAPI/typedef_underscore.c b/clang/test/ExtractAPI/typedef_underscore.c new file mode 100644 index 0000000000000..a42046907b46d --- /dev/null +++ b/clang/test/ExtractAPI/typedef_underscore.c @@ -0,0 +1,69 @@ +// RUN: rm -rf %t +// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \ +// RUN: --product-name=TypedefChain -triple arm64-apple-macosx -x c-header %s -o %t/typedefchain-c.symbols.json -verify +// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \ +// RUN: --product-name=TypedefChain -triple arm64-apple-macosx -x c++-header %s -o %t/typedefchain-cxx.symbols.json -verify + +// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix MYSTRUCT +// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix MYSTRUCT +typedef struct _MyStruct { } MyStruct; + +// MYSTRUCT-LABEL: "!testLabel": "c:@S@_MyStruct" +// MYSTRUCT: "accessLevel": "public", +// MYSTRUCT: "declarationFragments": [ +// MYSTRUCT-NEXT: { +// MYSTRUCT-NEXT: "kind": "keyword", +// MYSTRUCT-NEXT: "spelling": "typedef" +// MYSTRUCT-NEXT: }, +// MYSTRUCT-NEXT: { +// MYSTRUCT-NEXT: "kind": "text", +// MYSTRUCT-NEXT: "spelling": " " +// MYSTRUCT-NEXT: }, +// MYSTRUCT-NEXT: { +// 
MYSTRUCT-NEXT: "kind": "keyword", +// MYSTRUCT-NEXT: "spelling": "struct" +// MYSTRUCT-NEXT: }, +// MYSTRUCT-NEXT: { +// MYSTRUCT-NEXT: "kind": "text", +// MYSTRUCT-NEXT: "spelling": " " +// MYSTRUCT-NEXT: }, +// MYSTRUCT-NEXT: { +// MYSTRUCT-NEXT: "kind": "identifier", +// MYSTRUCT-NEXT: "spelling": "_MyStruct" +// MYSTRUCT-NEXT: }, +// MYSTRUCT-NEXT: { +// MYSTRUCT-NEXT: "kind": "text", +// MYSTRUCT-NEXT: "spelling": " { ... } " +// MYSTRUCT-NEXT: }, +// MYSTRUCT-NEXT: { +// MYSTRUCT-NEXT: "kind": "identifier", +// MYSTRUCT-NEXT: "spelling": "MyStruct" +// MYSTRUCT-NEXT: }, +// MYSTRUCT-NEXT: { +// MYSTRUCT-NEXT: "kind": "text", +// MYSTRUCT-NEXT: "spelling": ";" +// MYSTRUCT-NEXT: } +// MYSTRUCT-NEXT: ], +// MYSTRUCT: "kind": { +// MYSTRUCT-NEXT: "displayName": "Structure", +// MYSTRUCT-NEXT: "identifier": "c{{(\+\+)?}}.struct" +// MYSTRUCT: "names": { +// MYSTRUCT-NEXT: "navigator": [ +// MYSTRUCT-NEXT: { +// MYSTRUCT-NEXT: "kind": "identifier", +// MYSTRUCT-NEXT: "spelling": "MyStruct" +// MYSTRUCT-NEXT: } +// MYSTRUCT-NEXT: ], +// MYSTRUCT-NEXT: "subHeading": [ +// MYSTRUCT-NEXT: { +// MYSTRUCT-NEXT: "kind": "identifier", +// MYSTRUCT-NEXT: "spelling": "MyStruct" +// MYSTRUCT-NEXT: } +// MYSTRUCT-NEXT: ], +// MYSTRUCT-NEXT: "title": "MyStruct" +// MYSTRUCT-NEXT: }, +// MYSTRUCT: "pathComponents": [ +// MYSTRUCT-NEXT: "MyStruct" +// MYSTRUCT-NEXT: ] + +// expected-no-diagnostics From 7623d917849daf6d796e497d125e48026b0247cf Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 7 Feb 2025 14:34:00 -0600 Subject: [PATCH 017/293] Revert "[libc][stdfix] Fix buildbot failure because of a typo. (#126291)" This reverts commit bada9220b87e73c0f4a498b82f883e17eda928d1. Revert "[libc][stdfix] Implement fixed point `countlsfx` functions in llvm-libc (#125356)" This reverts commit f2a1103b323492160d7d27a1575fbda709b49036. 
--- libc/config/baremetal/arm/entrypoints.txt | 12 --- libc/config/baremetal/riscv/entrypoints.txt | 12 --- libc/config/linux/riscv/entrypoints.txt | 12 --- libc/config/linux/x86_64/entrypoints.txt | 12 --- libc/docs/headers/math/stdfix.rst | 2 +- libc/include/stdfix.yaml | 84 ------------------- libc/src/__support/fixed_point/CMakeLists.txt | 1 - libc/src/__support/fixed_point/fx_bits.h | 36 +------- libc/src/stdfix/CMakeLists.txt | 12 --- libc/src/stdfix/countlshk.cpp | 20 ----- libc/src/stdfix/countlshk.h | 21 ----- libc/src/stdfix/countlshr.cpp | 20 ----- libc/src/stdfix/countlshr.h | 21 ----- libc/src/stdfix/countlsk.cpp | 18 ---- libc/src/stdfix/countlsk.h | 21 ----- libc/src/stdfix/countlslk.cpp | 20 ----- libc/src/stdfix/countlslk.h | 21 ----- libc/src/stdfix/countlslr.cpp | 20 ----- libc/src/stdfix/countlslr.h | 21 ----- libc/src/stdfix/countlsr.cpp | 18 ---- libc/src/stdfix/countlsr.h | 21 ----- libc/src/stdfix/countlsuhk.cpp | 20 ----- libc/src/stdfix/countlsuhk.h | 21 ----- libc/src/stdfix/countlsuhr.cpp | 20 ----- libc/src/stdfix/countlsuhr.h | 21 ----- libc/src/stdfix/countlsuk.cpp | 20 ----- libc/src/stdfix/countlsuk.h | 21 ----- libc/src/stdfix/countlsulk.cpp | 20 ----- libc/src/stdfix/countlsulk.h | 21 ----- libc/src/stdfix/countlsulr.cpp | 20 ----- libc/src/stdfix/countlsulr.h | 21 ----- libc/src/stdfix/countlsur.cpp | 20 ----- libc/src/stdfix/countlsur.h | 21 ----- libc/test/src/stdfix/CMakeLists.txt | 16 ---- libc/test/src/stdfix/CountlsTest.h | 62 -------------- libc/test/src/stdfix/countlshk_test.cpp | 13 --- libc/test/src/stdfix/countlshr_test.cpp | 13 --- libc/test/src/stdfix/countlsk_test.cpp | 13 --- libc/test/src/stdfix/countlslk_test.cpp | 13 --- libc/test/src/stdfix/countlslr_test.cpp | 13 --- libc/test/src/stdfix/countlsr_test.cpp | 13 --- libc/test/src/stdfix/countlsuhk_test.cpp | 13 --- libc/test/src/stdfix/countlsuhr_test.cpp | 13 --- libc/test/src/stdfix/countlsuk_test.cpp | 13 --- libc/test/src/stdfix/countlsulk_test.cpp | 13 --- 
libc/test/src/stdfix/countlsulr_test.cpp | 13 --- libc/test/src/stdfix/countlsur_test.cpp | 13 --- 47 files changed, 3 insertions(+), 902 deletions(-) delete mode 100644 libc/src/stdfix/countlshk.cpp delete mode 100644 libc/src/stdfix/countlshk.h delete mode 100644 libc/src/stdfix/countlshr.cpp delete mode 100644 libc/src/stdfix/countlshr.h delete mode 100644 libc/src/stdfix/countlsk.cpp delete mode 100644 libc/src/stdfix/countlsk.h delete mode 100644 libc/src/stdfix/countlslk.cpp delete mode 100644 libc/src/stdfix/countlslk.h delete mode 100644 libc/src/stdfix/countlslr.cpp delete mode 100644 libc/src/stdfix/countlslr.h delete mode 100644 libc/src/stdfix/countlsr.cpp delete mode 100644 libc/src/stdfix/countlsr.h delete mode 100644 libc/src/stdfix/countlsuhk.cpp delete mode 100644 libc/src/stdfix/countlsuhk.h delete mode 100644 libc/src/stdfix/countlsuhr.cpp delete mode 100644 libc/src/stdfix/countlsuhr.h delete mode 100644 libc/src/stdfix/countlsuk.cpp delete mode 100644 libc/src/stdfix/countlsuk.h delete mode 100644 libc/src/stdfix/countlsulk.cpp delete mode 100644 libc/src/stdfix/countlsulk.h delete mode 100644 libc/src/stdfix/countlsulr.cpp delete mode 100644 libc/src/stdfix/countlsulr.h delete mode 100644 libc/src/stdfix/countlsur.cpp delete mode 100644 libc/src/stdfix/countlsur.h delete mode 100644 libc/test/src/stdfix/CountlsTest.h delete mode 100644 libc/test/src/stdfix/countlshk_test.cpp delete mode 100644 libc/test/src/stdfix/countlshr_test.cpp delete mode 100644 libc/test/src/stdfix/countlsk_test.cpp delete mode 100644 libc/test/src/stdfix/countlslk_test.cpp delete mode 100644 libc/test/src/stdfix/countlslr_test.cpp delete mode 100644 libc/test/src/stdfix/countlsr_test.cpp delete mode 100644 libc/test/src/stdfix/countlsuhk_test.cpp delete mode 100644 libc/test/src/stdfix/countlsuhr_test.cpp delete mode 100644 libc/test/src/stdfix/countlsuk_test.cpp delete mode 100644 libc/test/src/stdfix/countlsulk_test.cpp delete mode 100644 
libc/test/src/stdfix/countlsulr_test.cpp delete mode 100644 libc/test/src/stdfix/countlsur_test.cpp diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index 351f727389e3a..694cd7b1993ca 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -469,18 +469,6 @@ if(LIBC_COMPILER_HAS_FIXED_POINT) libc.src.stdfix.ukbits libc.src.stdfix.lkbits libc.src.stdfix.ulkbits - libc.src.stdfix.countlshr - libc.src.stdfix.countlsr - libc.src.stdfix.countlslr - libc.src.stdfix.countlshk - libc.src.stdfix.countlsk - libc.src.stdfix.countlslk - libc.src.stdfix.countlsuhr - libc.src.stdfix.countlsur - libc.src.stdfix.countlsulr - libc.src.stdfix.countlsuhk - libc.src.stdfix.countlsuk - libc.src.stdfix.countlsulk ) endif() diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt index 39c70a22a21e0..667ab40dca999 100644 --- a/libc/config/baremetal/riscv/entrypoints.txt +++ b/libc/config/baremetal/riscv/entrypoints.txt @@ -464,18 +464,6 @@ if(LIBC_COMPILER_HAS_FIXED_POINT) libc.src.stdfix.ukbits libc.src.stdfix.lkbits libc.src.stdfix.ulkbits - libc.src.stdfix.countlshr - libc.src.stdfix.countlsr - libc.src.stdfix.countlslr - libc.src.stdfix.countlshk - libc.src.stdfix.countlsk - libc.src.stdfix.countlslk - libc.src.stdfix.countlsuhr - libc.src.stdfix.countlsur - libc.src.stdfix.countlsulr - libc.src.stdfix.countlsuhk - libc.src.stdfix.countlsuk - libc.src.stdfix.countlsulk ) endif() diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index a9ba0c257755b..6e67ea559d57b 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -749,18 +749,6 @@ if(LIBC_COMPILER_HAS_FIXED_POINT) # TODO: https://github.com/llvm/llvm-project/issues/115778 libc.src.stdfix.lkbits libc.src.stdfix.ulkbits - libc.src.stdfix.countlshr - libc.src.stdfix.countlsr - 
libc.src.stdfix.countlslr - libc.src.stdfix.countlshk - libc.src.stdfix.countlsk - libc.src.stdfix.countlslk - libc.src.stdfix.countlsuhr - libc.src.stdfix.countlsur - libc.src.stdfix.countlsulr - libc.src.stdfix.countlsuhk - libc.src.stdfix.countlsuk - libc.src.stdfix.countlsulk ) endif() diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 2a4c17a56f377..81dceb74a1774 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -874,18 +874,6 @@ if(LIBC_COMPILER_HAS_FIXED_POINT) libc.src.stdfix.ukbits libc.src.stdfix.lkbits libc.src.stdfix.ulkbits - libc.src.stdfix.countlshr - libc.src.stdfix.countlsr - libc.src.stdfix.countlslr - libc.src.stdfix.countlshk - libc.src.stdfix.countlsk - libc.src.stdfix.countlslk - libc.src.stdfix.countlsuhr - libc.src.stdfix.countlsur - libc.src.stdfix.countlsulr - libc.src.stdfix.countlsuhk - libc.src.stdfix.countlsuk - libc.src.stdfix.countlsulk ) endif() diff --git a/libc/docs/headers/math/stdfix.rst b/libc/docs/headers/math/stdfix.rst index 4507f2b608bf1..58052f000995c 100644 --- a/libc/docs/headers/math/stdfix.rst +++ b/libc/docs/headers/math/stdfix.rst @@ -73,7 +73,7 @@ The following functions are included in the ISO/IEC TR 18037:2008 standard. 
+---------------+----------------+-------------+---------------+------------+----------------+-------------+----------------+-------------+---------------+------------+----------------+-------------+ | \*bits | | | | | | | | | | | | | +---------------+----------------+-------------+---------------+------------+----------------+-------------+----------------+-------------+---------------+------------+----------------+-------------+ -| countls | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | +| countls | | | | | | | | | | | | | +---------------+----------------+-------------+---------------+------------+----------------+-------------+----------------+-------------+---------------+------------+----------------+-------------+ | divi | | | | | | | | | | | | | +---------------+----------------+-------------+---------------+------------+----------------+-------------+----------------+-------------+---------------+------------+----------------+-------------+ diff --git a/libc/include/stdfix.yaml b/libc/include/stdfix.yaml index 0abf2f3a9b3b6..9663ac0c7df4d 100644 --- a/libc/include/stdfix.yaml +++ b/libc/include/stdfix.yaml @@ -306,87 +306,3 @@ functions: arguments: - type: unsigned int guard: LIBC_COMPILER_HAS_FIXED_POINT - - name: countlshr - standards: - - stdc_ext - return_type: int - arguments: - - type: short fract - guard: LIBC_COMPILER_HAS_FIXED_POINT - - name: countlsr - standards: - - stdc_ext - return_type: int - arguments: - - type: fract - guard: LIBC_COMPILER_HAS_FIXED_POINT - - name: countlslr - standards: - - stdc_ext - return_type: int - arguments: - - type: long fract - guard: LIBC_COMPILER_HAS_FIXED_POINT - - name: countlshk - standards: - - stdc_ext - return_type: int - arguments: - - type: short accum - guard: LIBC_COMPILER_HAS_FIXED_POINT - - name: countlsk - standards: - - stdc_ext - return_type: int - arguments: - - type: accum - guard: LIBC_COMPILER_HAS_FIXED_POINT - - name: 
countlslk - standards: - - stdc_ext - return_type: int - arguments: - - type: long accum - guard: LIBC_COMPILER_HAS_FIXED_POINT - - name: countlsuhr - standards: - - stdc_ext - return_type: int - arguments: - - type: unsigned short fract - guard: LIBC_COMPILER_HAS_FIXED_POINT - - name: countlsur - standards: - - stdc_ext - return_type: int - arguments: - - type: unsigned fract - guard: LIBC_COMPILER_HAS_FIXED_POINT - - name: countlsulr - standards: - - stdc_ext - return_type: int - arguments: - - type: unsigned long fract - guard: LIBC_COMPILER_HAS_FIXED_POINT - - name: countlsuhk - standards: - - stdc_ext - return_type: int - arguments: - - type: unsigned short accum - guard: LIBC_COMPILER_HAS_FIXED_POINT - - name: countlsuk - standards: - - stdc_ext - return_type: int - arguments: - - type: unsigned accum - guard: LIBC_COMPILER_HAS_FIXED_POINT - - name: countlsulk - standards: - - stdc_ext - return_type: int - arguments: - - type: unsigned long accum - guard: LIBC_COMPILER_HAS_FIXED_POINT diff --git a/libc/src/__support/fixed_point/CMakeLists.txt b/libc/src/__support/fixed_point/CMakeLists.txt index b415e2c00c488..3b744081765e4 100644 --- a/libc/src/__support/fixed_point/CMakeLists.txt +++ b/libc/src/__support/fixed_point/CMakeLists.txt @@ -19,7 +19,6 @@ add_header_library( libc.src.__support.macros.optimization libc.src.__support.CPP.type_traits libc.src.__support.CPP.bit - libc.src.__support.CPP.limits libc.src.__support.math_extras ) diff --git a/libc/src/__support/fixed_point/fx_bits.h b/libc/src/__support/fixed_point/fx_bits.h index 4b87a9018d670..225ea417760a0 100644 --- a/libc/src/__support/fixed_point/fx_bits.h +++ b/libc/src/__support/fixed_point/fx_bits.h @@ -11,10 +11,9 @@ #include "include/llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/bit.h" -#include "src/__support/CPP/limits.h" // numeric_limits #include "src/__support/CPP/type_traits.h" -#include "src/__support/macros/attributes.h" // LIBC_INLINE -#include 
"src/__support/macros/config.h" // LIBC_NAMESPACE_DECL +#include "src/__support/macros/attributes.h" // LIBC_INLINE +#include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/math_extras.h" @@ -51,12 +50,6 @@ template struct FXBits { static constexpr StorageType SIGN_MASK = (fx_rep::SIGN_LEN == 0 ? 0 : StorageType(1) << SIGN_OFFSET); - // mask for - static constexpr StorageType VALUE_MASK = INTEGRAL_MASK | FRACTION_MASK; - - // mask for - static constexpr StorageType TOTAL_MASK = SIGN_MASK | VALUE_MASK; - public: LIBC_INLINE constexpr FXBits() = default; @@ -81,12 +74,6 @@ template struct FXBits { return (value & INTEGRAL_MASK) >> INTEGRAL_OFFSET; } - // returns complete bitstring representation the fixed point number - // the bitstring is of the form: padding | sign | integral | fraction - LIBC_INLINE constexpr StorageType get_bits() { - return (value & TOTAL_MASK) >> FRACTION_OFFSET; - } - // TODO: replace bool with Sign LIBC_INLINE constexpr bool get_sign() { return static_cast((value & SIGN_MASK) >> SIGN_OFFSET); @@ -176,25 +163,6 @@ template LIBC_INLINE constexpr T round(T x, int n) { return bit_and((x + round_bit), rounding_mask); } -// count leading sign bits -template -LIBC_INLINE constexpr cpp::enable_if_t, int> -countls(T f) { - using FXRep = FXRep; - using BitType = typename FXRep::StorageType; - using FXBits = FXBits; - - constexpr int CONTAIN_LEN = cpp::numeric_limits::digits; - constexpr int PADDING_LEN = CONTAIN_LEN - FXRep::TOTAL_LEN; - - if constexpr (FXRep::SIGN_LEN != 0) - if (f < 0) - f = bit_not(f); - - BitType value_bits = FXBits(f)::get_bits(); - return cpp::countl_zero(value_bits) - PADDING_LEN; -} - } // namespace fixed_point } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/CMakeLists.txt b/libc/src/stdfix/CMakeLists.txt index 3f6f9125a086b..815f739d23efa 100644 --- a/libc/src/stdfix/CMakeLists.txt +++ b/libc/src/stdfix/CMakeLists.txt @@ -53,18 +53,6 @@ 
foreach(suffix IN ITEMS hr r lr hk k lk uhr ur ulr uhk uk ulk) libc.src.__support.CPP.bit libc.src.__support.fixed_point.fx_bits ) - - add_entrypoint_object( - countls${suffix} - HDRS - countls${suffix}.h - SRCS - countls${suffix}.cpp - COMPILE_OPTIONS - ${libc_opt_high_flag} - DEPENDS - libc.src.__support.fixed_point.fx_bits - ) endforeach() add_entrypoint_object( diff --git a/libc/src/stdfix/countlshk.cpp b/libc/src/stdfix/countlshk.cpp deleted file mode 100644 index f94728beff1cb..0000000000000 --- a/libc/src/stdfix/countlshk.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===-- Implementation for countlshk function ----------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "countlshk.h" -#include "src/__support/common.h" -#include "src/__support/fixed_point/fx_bits.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, countlshk, (short accum f)) { - return fixed_point::countls(f); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlshk.h b/libc/src/stdfix/countlshk.h deleted file mode 100644 index ab334244e166a..0000000000000 --- a/libc/src/stdfix/countlshk.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for countlshk function ------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSHK_H -#define LLVM_LIBC_SRC_STDFIX_COUNTLSHK_H - -#include "include/llvm-libc-macros/stdfix-macros.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -int countlshk(short accum f); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSHK_H diff --git a/libc/src/stdfix/countlshr.cpp b/libc/src/stdfix/countlshr.cpp deleted file mode 100644 index d77d3e9a3c22a..0000000000000 --- a/libc/src/stdfix/countlshr.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===-- Implementation for countlshr function ----------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "countlshr.h" -#include "src/__support/common.h" -#include "src/__support/fixed_point/fx_bits.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, countlshr, (short fract f)) { - return fixed_point::countls(f); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlshr.h b/libc/src/stdfix/countlshr.h deleted file mode 100644 index 579b7b680406e..0000000000000 --- a/libc/src/stdfix/countlshr.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for countlshr function ------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSHR_H -#define LLVM_LIBC_SRC_STDFIX_COUNTLSHR_H - -#include "include/llvm-libc-macros/stdfix-macros.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -int countlshr(short fract f); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSHR_H diff --git a/libc/src/stdfix/countlsk.cpp b/libc/src/stdfix/countlsk.cpp deleted file mode 100644 index b6f56adee16a6..0000000000000 --- a/libc/src/stdfix/countlsk.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//===-- Implementation for countlsk function -----------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "countlsk.h" -#include "src/__support/common.h" -#include "src/__support/fixed_point/fx_bits.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, countlsk, (accum f)) { return fixed_point::countls(f); } - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsk.h b/libc/src/stdfix/countlsk.h deleted file mode 100644 index d0c893bc078d5..0000000000000 --- a/libc/src/stdfix/countlsk.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for countlsk function -------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSK_H -#define LLVM_LIBC_SRC_STDFIX_COUNTLSK_H - -#include "include/llvm-libc-macros/stdfix-macros.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -int countlsk(accum f); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSK_H diff --git a/libc/src/stdfix/countlslk.cpp b/libc/src/stdfix/countlslk.cpp deleted file mode 100644 index 9bf30ff34c6ee..0000000000000 --- a/libc/src/stdfix/countlslk.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===-- Implementation for countlslk function ----------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "countlslk.h" -#include "src/__support/common.h" -#include "src/__support/fixed_point/fx_bits.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, countlslk, (long accum f)) { - return fixed_point::countls(f); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlslk.h b/libc/src/stdfix/countlslk.h deleted file mode 100644 index 60fa469797b7a..0000000000000 --- a/libc/src/stdfix/countlslk.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for countlslk function ------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSLK_H -#define LLVM_LIBC_SRC_STDFIX_COUNTLSLK_H - -#include "include/llvm-libc-macros/stdfix-macros.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -int countlslk(long accum f); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSLK_H diff --git a/libc/src/stdfix/countlslr.cpp b/libc/src/stdfix/countlslr.cpp deleted file mode 100644 index 774023c734a37..0000000000000 --- a/libc/src/stdfix/countlslr.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===-- Implementation for countlslr function ----------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "countlslr.h" -#include "src/__support/common.h" -#include "src/__support/fixed_point/fx_bits.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, countlslr, (long fract f)) { - return fixed_point::countls(f); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlslr.h b/libc/src/stdfix/countlslr.h deleted file mode 100644 index c909551e77a1a..0000000000000 --- a/libc/src/stdfix/countlslr.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for countlslr function ------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSLR_H -#define LLVM_LIBC_SRC_STDFIX_COUNTLSLR_H - -#include "include/llvm-libc-macros/stdfix-macros.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -int countlslr(long fract f); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSLR_H diff --git a/libc/src/stdfix/countlsr.cpp b/libc/src/stdfix/countlsr.cpp deleted file mode 100644 index 14563127ad5e9..0000000000000 --- a/libc/src/stdfix/countlsr.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//===-- Implementation for countlsr function -----------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "countlsr.h" -#include "src/__support/common.h" -#include "src/__support/fixed_point/fx_bits.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, countlsr, (fract f)) { return fixed_point::countls(f); } - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsr.h b/libc/src/stdfix/countlsr.h deleted file mode 100644 index 75dcf4aff0ca3..0000000000000 --- a/libc/src/stdfix/countlsr.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for countlsr function -------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSR_H -#define LLVM_LIBC_SRC_STDFIX_COUNTLSR_H - -#include "include/llvm-libc-macros/stdfix-macros.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -int countlsr(fract f); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSR_H diff --git a/libc/src/stdfix/countlsuhk.cpp b/libc/src/stdfix/countlsuhk.cpp deleted file mode 100644 index 2cc266f47da1f..0000000000000 --- a/libc/src/stdfix/countlsuhk.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===-- Implementation for countlsuhk function ---------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "countlsuhk.h" -#include "src/__support/common.h" -#include "src/__support/fixed_point/fx_bits.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, countlsuhk, (unsigned short accum f)) { - return fixed_point::countls(f); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsuhk.h b/libc/src/stdfix/countlsuhk.h deleted file mode 100644 index fcb2fec3500d4..0000000000000 --- a/libc/src/stdfix/countlsuhk.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for countlsuhk function -----------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSUHK_H -#define LLVM_LIBC_SRC_STDFIX_COUNTLSUHK_H - -#include "include/llvm-libc-macros/stdfix-macros.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -int countlsuhk(unsigned short accum f); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSUHK_H diff --git a/libc/src/stdfix/countlsuhr.cpp b/libc/src/stdfix/countlsuhr.cpp deleted file mode 100644 index f30b0dd731aa9..0000000000000 --- a/libc/src/stdfix/countlsuhr.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===-- Implementation for countlsuhr function ---------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "countlsuhr.h" -#include "src/__support/common.h" -#include "src/__support/fixed_point/fx_bits.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, countlsuhr, (unsigned short fract f)) { - return fixed_point::countls(f); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsuhr.h b/libc/src/stdfix/countlsuhr.h deleted file mode 100644 index c6ce001d38b11..0000000000000 --- a/libc/src/stdfix/countlsuhr.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for countlsuhr function -----------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSUHR_H -#define LLVM_LIBC_SRC_STDFIX_COUNTLSUHR_H - -#include "include/llvm-libc-macros/stdfix-macros.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -int countlsuhr(unsigned long fract f); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSUHR_H diff --git a/libc/src/stdfix/countlsuk.cpp b/libc/src/stdfix/countlsuk.cpp deleted file mode 100644 index 3f32ba0815b6e..0000000000000 --- a/libc/src/stdfix/countlsuk.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===-- Implementation for countlsuhk function ---------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "countlsuhk.h" -#include "src/__support/common.h" -#include "src/__support/fixed_point/fx_bits.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, countlsuhk, (unsigned accum f)) { - return fixed_point::countls(f); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsuk.h b/libc/src/stdfix/countlsuk.h deleted file mode 100644 index 7ad0e701b927b..0000000000000 --- a/libc/src/stdfix/countlsuk.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for countlsuk function ------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSUK_H -#define LLVM_LIBC_SRC_STDFIX_COUNTLSUK_H - -#include "include/llvm-libc-macros/stdfix-macros.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -int countlsuk(unsigned accum f); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSUK_H diff --git a/libc/src/stdfix/countlsulk.cpp b/libc/src/stdfix/countlsulk.cpp deleted file mode 100644 index 04090dd86c732..0000000000000 --- a/libc/src/stdfix/countlsulk.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===-- Implementation for countlsulk function ---------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "countlsulk.h" -#include "src/__support/common.h" -#include "src/__support/fixed_point/fx_bits.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, countlsulk, (unsigned long accum f)) { - return fixed_point::countls(f); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsulk.h b/libc/src/stdfix/countlsulk.h deleted file mode 100644 index 55ca9d2e20ff0..0000000000000 --- a/libc/src/stdfix/countlsulk.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for countlsulk function -----------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSULK_H -#define LLVM_LIBC_SRC_STDFIX_COUNTLSULK_H - -#include "include/llvm-libc-macros/stdfix-macros.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -int countlsulk(unsigned long accum f); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSULK_H diff --git a/libc/src/stdfix/countlsulr.cpp b/libc/src/stdfix/countlsulr.cpp deleted file mode 100644 index d9d6ff404c211..0000000000000 --- a/libc/src/stdfix/countlsulr.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===-- Implementation for countlsulr function ---------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "countlsulr.h" -#include "src/__support/common.h" -#include "src/__support/fixed_point/fx_bits.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, countlsulr, (unsigned long fract f)) { - return fixed_point::countls(f); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsulr.h b/libc/src/stdfix/countlsulr.h deleted file mode 100644 index 59e7d726d01b9..0000000000000 --- a/libc/src/stdfix/countlsulr.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for countlsulr function -----------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSULR_H -#define LLVM_LIBC_SRC_STDFIX_COUNTLSULR_H - -#include "include/llvm-libc-macros/stdfix-macros.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -int countlsulr(unsigned long fract f); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSULR_H diff --git a/libc/src/stdfix/countlsur.cpp b/libc/src/stdfix/countlsur.cpp deleted file mode 100644 index 777e5f387aadf..0000000000000 --- a/libc/src/stdfix/countlsur.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===-- Implementation for countlsur function ----------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "countlsur.h" -#include "src/__support/common.h" -#include "src/__support/fixed_point/fx_bits.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -LLVM_LIBC_FUNCTION(int, countlsur, (unsigned fract f)) { - return fixed_point::countls(f); -} - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdfix/countlsur.h b/libc/src/stdfix/countlsur.h deleted file mode 100644 index 1d34e971a52b3..0000000000000 --- a/libc/src/stdfix/countlsur.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Implementation header for countlsur function ------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDFIX_COUNTLSUR_H -#define LLVM_LIBC_SRC_STDFIX_COUNTLSUR_H - -#include "include/llvm-libc-macros/stdfix-macros.h" -#include "src/__support/macros/config.h" // LIBC_NAMESPACE_DECL - -namespace LIBC_NAMESPACE_DECL { - -int countlsur(unsigned fract f); - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDFIX_COUNTLSUR_H diff --git a/libc/test/src/stdfix/CMakeLists.txt b/libc/test/src/stdfix/CMakeLists.txt index c8c4fd96bc2b3..90d20438edb4b 100644 --- a/libc/test/src/stdfix/CMakeLists.txt +++ b/libc/test/src/stdfix/CMakeLists.txt @@ -73,22 +73,6 @@ foreach(suffix IN ITEMS hr r lr hk k lk uhr ur ulr uhk uk ulk) libc.src.__support.CPP.bit libc.src.__support.fixed_point.fx_bits ) - - add_libc_test( - countls${suffix}_test - SUITE - libc-stdfix-tests - HDRS - CountlsTest.h - SRCS - countls${suffix}_test.cpp - COMPILE_OPTIONS - -O3 - DEPENDS - libc.src.stdfix.countls${suffix} - libc.src.__support.fixed_point.fx_rep - libc.src.__support.fixed_point.fx_bits - ) endforeach() add_libc_test( diff --git a/libc/test/src/stdfix/CountlsTest.h b/libc/test/src/stdfix/CountlsTest.h deleted file mode 100644 index fe3917754a251..0000000000000 --- a/libc/test/src/stdfix/CountlsTest.h +++ /dev/null @@ -1,62 +0,0 @@ -//===-- Utility class to test countls -------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "test/UnitTest/Test.h" - -#include "src/__support/fixed_point/fx_rep.h" - -template class CountlsTest : public LIBC_NAMESPACE::testing::Test { - - using FXRep = LIBC_NAMESPACE::fixed_point::FXRep; - static constexpr T zero = FXRep::ZERO(); - static constexpr T max = FXRep::MAX(); - static constexpr T min = FXRep::MIN(); - static constexpr T one_half = FXRep::ONE_HALF(); - static constexpr T one_fourth = FXRep::ONE_FOURTH(); - static constexpr T eps = FXRep::EPS(); - - static constexpr auto value_len = FXRep::INTEGRAL_LEN + FXRep::FRACTION_LEN; - -public: - typedef int (*CountlsFunc)(T); - - void testSpecialNumbers(CountlsFunc func) { - constexpr bool is_signed = (FXRep::SIGN_LEN > 0); - - EXPECT_EQ(FXRep::INTEGRAL_LEN, func(one_half)); - EXPECT_EQ(FXRep::INTEGRAL_LEN + 1, func(one_fourth)); - EXPECT_EQ(value_len, func(zero)); - EXPECT_EQ(value_len - 1, func(eps)); - EXPECT_EQ(0, func(max)); - // If signed, left shifting the minimum value will overflow, so countls = 0. - // If unsigned, the minimum value is zero, so countls is the number of value - // bits according to ISO/IEC TR 18037. - EXPECT_EQ(is_signed ? 
0 : value_len, func(min)); - - if (10 <= static_cast(max)) { - EXPECT_EQ(FXRep::INTEGRAL_LEN - 4, func(10)); - } - - if (static_cast(min) <= -10) { - EXPECT_EQ(FXRep::INTEGRAL_LEN - 4, func(-10)); - } - - if constexpr (is_signed) { - EXPECT_EQ(value_len, func(-eps)); - EXPECT_EQ(FXRep::INTEGRAL_LEN + 1, func(-one_half)); - if (FXRep::FRACTION_LEN >= 2) { - EXPECT_EQ(FXRep::INTEGRAL_LEN + 2, func(-one_fourth)); - } - } - } -}; - -#define LIST_COUNTLS_TESTS(T, func) \ - using LlvmLibcCountlsTest = CountlsTest; \ - TEST_F(LlvmLibcCountlsTest, SpecialNumbers) { testSpecialNumbers(&func); } \ - static_assert(true, "Require semicolon.") diff --git a/libc/test/src/stdfix/countlshk_test.cpp b/libc/test/src/stdfix/countlshk_test.cpp deleted file mode 100644 index 659f869706b5f..0000000000000 --- a/libc/test/src/stdfix/countlshk_test.cpp +++ /dev/null @@ -1,13 +0,0 @@ -//===-- Unittests for countlshk -------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "CountlsTest.h" - -#include "src/stdfix/countlshk.h" - -LIST_COUNTLS_TESTS(short accum, LIBC_NAMESPACE::countlshk); diff --git a/libc/test/src/stdfix/countlshr_test.cpp b/libc/test/src/stdfix/countlshr_test.cpp deleted file mode 100644 index 361d4acab3b11..0000000000000 --- a/libc/test/src/stdfix/countlshr_test.cpp +++ /dev/null @@ -1,13 +0,0 @@ -//===-- Unittests for countlshr -------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "CountlsTest.h" - -#include "src/stdfix/countlshr.h" - -LIST_COUNTLS_TESTS(short fract, LIBC_NAMESPACE::countlshr); diff --git a/libc/test/src/stdfix/countlsk_test.cpp b/libc/test/src/stdfix/countlsk_test.cpp deleted file mode 100644 index 74cb519ec78de..0000000000000 --- a/libc/test/src/stdfix/countlsk_test.cpp +++ /dev/null @@ -1,13 +0,0 @@ -//===-- Unittests for countlsk --------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "CountlsTest.h" - -#include "src/stdfix/countlsk.h" - -LIST_COUNTLS_TESTS(accum, LIBC_NAMESPACE::countlsk); diff --git a/libc/test/src/stdfix/countlslk_test.cpp b/libc/test/src/stdfix/countlslk_test.cpp deleted file mode 100644 index 006939db3c87e..0000000000000 --- a/libc/test/src/stdfix/countlslk_test.cpp +++ /dev/null @@ -1,13 +0,0 @@ -//===-- Unittests for countlslk -------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "CountlsTest.h" - -#include "src/stdfix/countlslk.h" - -LIST_COUNTLS_TESTS(long accum, LIBC_NAMESPACE::countlslk); diff --git a/libc/test/src/stdfix/countlslr_test.cpp b/libc/test/src/stdfix/countlslr_test.cpp deleted file mode 100644 index 896cf9259c3ea..0000000000000 --- a/libc/test/src/stdfix/countlslr_test.cpp +++ /dev/null @@ -1,13 +0,0 @@ -//===-- Unittests for countlslr -------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "CountlsTest.h" - -#include "src/stdfix/countlslr.h" - -LIST_COUNTLS_TESTS(long fract, LIBC_NAMESPACE::countlslr); diff --git a/libc/test/src/stdfix/countlsr_test.cpp b/libc/test/src/stdfix/countlsr_test.cpp deleted file mode 100644 index d7ae91ccd6a92..0000000000000 --- a/libc/test/src/stdfix/countlsr_test.cpp +++ /dev/null @@ -1,13 +0,0 @@ -//===-- Unittests for countlsr --------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "CountlsTest.h" - -#include "src/stdfix/countlsr.h" - -LIST_COUNTLS_TESTS(fract, LIBC_NAMESPACE::countlsr); diff --git a/libc/test/src/stdfix/countlsuhk_test.cpp b/libc/test/src/stdfix/countlsuhk_test.cpp deleted file mode 100644 index d8e68d65160e7..0000000000000 --- a/libc/test/src/stdfix/countlsuhk_test.cpp +++ /dev/null @@ -1,13 +0,0 @@ -//===-- Unittests for countlsuhk ------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "CountlsTest.h" - -#include "src/stdfix/countlsuhk.h" - -LIST_COUNTLS_TESTS(unsigned short accum, LIBC_NAMESPACE::countlsuhk); diff --git a/libc/test/src/stdfix/countlsuhr_test.cpp b/libc/test/src/stdfix/countlsuhr_test.cpp deleted file mode 100644 index 7dbc590d4a552..0000000000000 --- a/libc/test/src/stdfix/countlsuhr_test.cpp +++ /dev/null @@ -1,13 +0,0 @@ -//===-- Unittests for countlsuhr ------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "CountlsTest.h" - -#include "src/stdfix/countlsuhr.h" - -LIST_COUNTLS_TESTS(unsigned short fract, LIBC_NAMESPACE::countlsuhr); diff --git a/libc/test/src/stdfix/countlsuk_test.cpp b/libc/test/src/stdfix/countlsuk_test.cpp deleted file mode 100644 index 20f78d8c942b6..0000000000000 --- a/libc/test/src/stdfix/countlsuk_test.cpp +++ /dev/null @@ -1,13 +0,0 @@ -//===-- Unittests for countlsuk -------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "CountlsTest.h" - -#include "src/stdfix/countlsuk.h" - -LIST_COUNTLS_TESTS(unsigned accum, LIBC_NAMESPACE::countlsuk); diff --git a/libc/test/src/stdfix/countlsulk_test.cpp b/libc/test/src/stdfix/countlsulk_test.cpp deleted file mode 100644 index 81ae208055cd9..0000000000000 --- a/libc/test/src/stdfix/countlsulk_test.cpp +++ /dev/null @@ -1,13 +0,0 @@ -//===-- Unittests for countlsulk ------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "CountlsTest.h" - -#include "src/stdfix/countlsulk.h" - -LIST_COUNTLS_TESTS(unsigned long accum, LIBC_NAMESPACE::countlsulk); diff --git a/libc/test/src/stdfix/countlsulr_test.cpp b/libc/test/src/stdfix/countlsulr_test.cpp deleted file mode 100644 index 5b9b047f7fd74..0000000000000 --- a/libc/test/src/stdfix/countlsulr_test.cpp +++ /dev/null @@ -1,13 +0,0 @@ -//===-- Unittests for countlsulr ------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "CountlsTest.h" - -#include "src/stdfix/countlsulr.h" - -LIST_COUNTLS_TESTS(unsigned long fract, LIBC_NAMESPACE::countlsulr); diff --git a/libc/test/src/stdfix/countlsur_test.cpp b/libc/test/src/stdfix/countlsur_test.cpp deleted file mode 100644 index 67e32d7b56217..0000000000000 --- a/libc/test/src/stdfix/countlsur_test.cpp +++ /dev/null @@ -1,13 +0,0 @@ -//===-- Unittests for countlsur -------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "CountlsTest.h" - -#include "src/stdfix/countlsur.h" - -LIST_COUNTLS_TESTS(unsigned fract, LIBC_NAMESPACE::countlsur); From dee20925e76e9aed50c0fd4ac36a9ee1039a5265 Mon Sep 17 00:00:00 2001 From: Vinay Deshmukh <32487576+vinay-deshmukh@users.noreply.github.com> Date: Fri, 7 Feb 2025 15:50:26 -0500 Subject: [PATCH 018/293] [libc][test] `-Wimplicit-fallthrough`, `-Wwrite-strings` and non-GCC warnings (#124036) * Enabled `-Wimplicit-fallthrough` * Enabled `-Wwrite-strings` * Enabled non-GCC warnings * Fix non-GCC to mean `clang` * Move `-Wstrict-prototypes` to common section See: https://github.com/llvm/llvm-project/pull/122835#discussion_r1924109264 Relates to #119281 --- libc/cmake/modules/LLVMLibCTestRules.cmake | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 10bb9c9487d63..5709a11b1a201 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -38,9 +38,8 @@ function(_get_common_test_compile_options output_var c_test flags) endif() # list(APPEND compile_options "-Wconversion") # list(APPEND compile_options "-Wno-sign-conversion") - # list(APPEND compile_options "-Wimplicit-fallthrough") - # list(APPEND compile_options "-Wwrite-strings") - list(APPEND compile_options "-Wextra-semi") + list(APPEND compile_options "-Wimplicit-fallthrough") + list(APPEND compile_options "-Wwrite-strings") # Silence this warning because _Complex is a part of C99. 
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") if(NOT c_test) @@ -51,13 +50,14 @@ function(_get_common_test_compile_options output_var c_test flags) list(APPEND compile_options "-Wno-gnu-imaginary-constant") endif() list(APPEND compile_options "-Wno-pedantic") - # if(NOT CMAKE_COMPILER_IS_GNUCXX) - # list(APPEND compile_options "-Wnewline-eof") - # list(APPEND compile_options "-Wnonportable-system-include-path") - # list(APPEND compile_options "-Wstrict-prototypes") - # list(APPEND compile_options "-Wthread-safety") - # list(APPEND compile_options "-Wglobal-constructors") - # endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + list(APPEND compile_options "-Wstrict-prototypes") + list(APPEND compile_options "-Wextra-semi") + list(APPEND compile_options "-Wnewline-eof") + list(APPEND compile_options "-Wnonportable-system-include-path") + list(APPEND compile_options "-Wthread-safety") + # list(APPEND compile_options "-Wglobal-constructors") + endif() endif() set(${output_var} ${compile_options} PARENT_SCOPE) endfunction() From cea799afc632879f7d08157108a60c67317829a2 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 7 Feb 2025 20:50:46 +0000 Subject: [PATCH 019/293] [LV] Add ordered reduction test with live-in. Extra test for https://github.com/llvm/llvm-project/pull/124644. 
--- .../strict-fadd-interleave-only.ll | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll index 9e7c667f9c8ad..760bdbf227fff 100644 --- a/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll +++ b/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll @@ -217,3 +217,94 @@ exit: %.lcssa = phi float [ %rdx.next, %loop ] ret float %.lcssa } + +define float @fadd_reduction_with_live_in(float %inc) { +; CHECK-LABEL: define float @fadd_reduction_with_live_in( +; CHECK-SAME: float [[INC:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IV:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[VEC_IV1:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = icmp ule i32 [[VEC_IV]], 1000 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i32 [[VEC_IV1]], 1000 +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP0]], float [[INC]], float -0.000000e+00 +; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[VEC_PHI]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP1]], float [[INC]], float -0.000000e+00 +; CHECK-NEXT: [[TMP5]] = fadd float [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1002 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1002, 
[[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[SUM_NEXT]] = fadd float [[SUM]], [[INC]] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 1000 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[LCSSA:%.*]] = phi float [ [[SUM_NEXT]], [[LOOP]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[LCSSA]] +; +; CHECK-ALM-LABEL: define float @fadd_reduction_with_live_in( +; CHECK-ALM-SAME: float [[INC:%.*]]) { +; CHECK-ALM-NEXT: entry: +; CHECK-ALM-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-ALM: vector.ph: +; CHECK-ALM-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-ALM: vector.body: +; CHECK-ALM-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-ALM-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-ALM-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-ALM-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 +; CHECK-ALM-NEXT: [[ACTIVE_LANE_MASK:%.*]] = icmp ult i32 [[TMP0]], 1001 +; CHECK-ALM-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = icmp ult i32 [[TMP1]], 1001 +; CHECK-ALM-NEXT: [[TMP2:%.*]] = select i1 [[ACTIVE_LANE_MASK]], float [[INC]], float -0.000000e+00 +; CHECK-ALM-NEXT: [[TMP3:%.*]] = fadd float [[VEC_PHI]], [[TMP2]] +; CHECK-ALM-NEXT: [[TMP4:%.*]] = select i1 [[ACTIVE_LANE_MASK1]], float [[INC]], float -0.000000e+00 +; CHECK-ALM-NEXT: [[TMP5]] = fadd float [[TMP3]], [[TMP4]] +; CHECK-ALM-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; 
CHECK-ALM-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1002 +; CHECK-ALM-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-ALM: middle.block: +; CHECK-ALM-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-ALM: scalar.ph: +; CHECK-ALM-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1002, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-ALM-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-ALM-NEXT: br label [[LOOP:%.*]] +; CHECK-ALM: loop: +; CHECK-ALM-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-ALM-NEXT: [[SUM:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[LOOP]] ] +; CHECK-ALM-NEXT: [[SUM_NEXT]] = fadd float [[SUM]], [[INC]] +; CHECK-ALM-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-ALM-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 1000 +; CHECK-ALM-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-ALM: exit: +; CHECK-ALM-NEXT: [[LCSSA:%.*]] = phi float [ [[SUM_NEXT]], [[LOOP]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] +; CHECK-ALM-NEXT: ret float [[LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %sum = phi float [ 0.000000e+00, %entry ], [ %sum.next, %loop ] + %sum.next = fadd float %sum, %inc + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv, 1000 + br i1 %ec, label %exit, label %loop + +exit: + %lcssa = phi float [ %sum.next, %loop ] + ret float %lcssa +} From fb1216e82979511a3dfc931e693b575c225c5224 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 7 Feb 2025 13:07:18 -0800 Subject: [PATCH 020/293] [NFC][GlobalISel] Minor cleanup in LegalityQuery constructors (#126285) - Remove a redundant LegalityQuery constructor by using a default value for `MMODescrs` and remove const for ArrayRef arguments. 
- Use a delegating constructor for `MemDesc` constructor that takes `MachineMemOperand`. --- llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index 5d7e03bbaeb7d..9472aa196f9b4 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -119,20 +119,17 @@ struct LegalityQuery { MemDesc(LLT MemoryTy, uint64_t AlignInBits, AtomicOrdering Ordering) : MemoryTy(MemoryTy), AlignInBits(AlignInBits), Ordering(Ordering) {} MemDesc(const MachineMemOperand &MMO) - : MemoryTy(MMO.getMemoryType()), - AlignInBits(MMO.getAlign().value() * 8), - Ordering(MMO.getSuccessOrdering()) {} + : MemDesc(MMO.getMemoryType(), MMO.getAlign().value() * 8, + MMO.getSuccessOrdering()) {} }; /// Operations which require memory can use this to place requirements on the /// memory type for each MMO. ArrayRef MMODescrs; - constexpr LegalityQuery(unsigned Opcode, const ArrayRef Types, - const ArrayRef MMODescrs) + constexpr LegalityQuery(unsigned Opcode, ArrayRef Types, + ArrayRef MMODescrs = {}) : Opcode(Opcode), Types(Types), MMODescrs(MMODescrs) {} - constexpr LegalityQuery(unsigned Opcode, const ArrayRef Types) - : LegalityQuery(Opcode, Types, {}) {} raw_ostream &print(raw_ostream &OS) const; }; From 6e5988863177e1d53e7a7abb7a3db2b85376f0f5 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 7 Feb 2025 13:09:58 -0800 Subject: [PATCH 021/293] workflows/premerge: Move concurrency definition to workflow level (#126308) Prior workflow runs were not being cancelled when the pull request was closed, and I think this was why. Also, there is no advantage to having the definitions at the job level. 
--- .github/workflows/premerge.yaml | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index b268f1faab989..178ab191a58be 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -21,15 +21,16 @@ on: - 'main' - 'release/**' +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true + jobs: premerge-checks-linux: if: >- github.repository_owner == 'llvm' && (github.event_name != 'pull_request' || github.event.action != 'closed') runs-on: llvm-premerge-linux-runners - concurrency: - group: ${{ github.workflow }}-linux-${{ github.event.pull_request.number || github.sha }} - cancel-in-progress: true steps: - name: Checkout LLVM uses: actions/checkout@v4 @@ -88,9 +89,6 @@ jobs: github.repository_owner == 'llvm' && (github.event_name != 'pull_request' || github.event.action != 'closed') runs-on: llvm-premerge-windows-runners - concurrency: - group: ${{ github.workflow }}-windows-${{ github.event.pull_request.number || github.sha }} - cancel-in-progress: true defaults: run: shell: bash @@ -148,9 +146,6 @@ jobs: permerge-check-macos: runs-on: macos-14 - concurrency: - group: ${{ github.workflow }}-macos-${{ github.event.pull_request.number || github.sha }} - cancel-in-progress: true if: >- github.repository_owner == 'llvm' && (startswith(github.ref_name, 'release/') || From 2e0c0931bc891e76effaeb9c82fa08749e61f9b3 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 7 Feb 2025 16:14:40 -0500 Subject: [PATCH 022/293] [gn] fix bug in bfba6215cec2 --- llvm/utils/gn/secondary/clang/test/BUILD.gn | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/utils/gn/secondary/clang/test/BUILD.gn b/llvm/utils/gn/secondary/clang/test/BUILD.gn index c83c21ac7b549..926407fbea2af 100644 --- a/llvm/utils/gn/secondary/clang/test/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/test/BUILD.gn 
@@ -67,7 +67,7 @@ write_lit_config("lit_site_cfg") { "ENABLE_SHARED=0", "LLVM_EXTERNAL_LIT=", "LLVM_HOST_TRIPLE=$llvm_current_triple", - "LLVM_INCLUDE_SPIRV_TOOLS_TESTS=", + "LLVM_INCLUDE_SPIRV_TOOLS_TESTS=0", "LLVM_LIT_TOOLS_DIR=", # Intentionally empty, matches cmake build. "LLVM_TOOL_LLVM_DRIVER_BUILD=0", # FIXME: Add actual support for this. "LLVM_USE_SANITIZER=", @@ -75,7 +75,6 @@ write_lit_config("lit_site_cfg") { "Python3_EXECUTABLE=$python_path", "USE_Z3_SOLVER=", "PPC_LINUX_DEFAULT_IEEELONGDOUBLE=0", - "LLVM_INCLUDE_SPIRV_TOOLS_TESTS=0", ] if (clang_enable_static_analyzer) { From 8b12acd2a4a030ad0be76295b98815f93b2631d8 Mon Sep 17 00:00:00 2001 From: Md Abdullah Shahneous Bari <98356296+mshahneo@users.noreply.github.com> Date: Fri, 7 Feb 2025 15:31:47 -0600 Subject: [PATCH 023/293] [mlir][vector][spirv] Handle 1-element vector.{load|store} lowering. (#126294) Add support for single element vector{load|store} lowering to SPIR-V. Since, SPIR-V converts single element vector to scalars, it needs special attention for vector{load|store} lowering to spirv{load|store}. 
--- .../VectorToSPIRV/VectorToSPIRV.cpp | 29 +++++++++++--- .../VectorToSPIRV/vector-to-spirv.mlir | 39 +++++++++++++++++++ 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp index 1ecb892a4ea92..bca77ba68fbd1 100644 --- a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp +++ b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp @@ -770,10 +770,20 @@ struct VectorLoadOpConverter final spirv::StorageClass storageClass = attr.getValue(); auto vectorType = loadOp.getVectorType(); - auto vectorPtrType = spirv::PointerType::get(vectorType, storageClass); - Value castedAccessChain = - rewriter.create(loc, vectorPtrType, accessChain); - rewriter.replaceOpWithNewOp(loadOp, vectorType, + // Use the converted vector type instead of original (single element vector + // would get converted to scalar). + auto spirvVectorType = typeConverter.convertType(vectorType); + auto vectorPtrType = spirv::PointerType::get(spirvVectorType, storageClass); + + // For single element vectors, we don't need to bitcast the access chain to + // the original vector type. Both is going to be the same, a pointer + // to a scalar. + Value castedAccessChain = (vectorType.getNumElements() == 1) + ? accessChain + : rewriter.create( + loc, vectorPtrType, accessChain); + + rewriter.replaceOpWithNewOp(loadOp, spirvVectorType, castedAccessChain); return success(); @@ -806,8 +816,15 @@ struct VectorStoreOpConverter final spirv::StorageClass storageClass = attr.getValue(); auto vectorType = storeOp.getVectorType(); auto vectorPtrType = spirv::PointerType::get(vectorType, storageClass); - Value castedAccessChain = - rewriter.create(loc, vectorPtrType, accessChain); + + // For single element vectors, we don't need to bitcast the access chain to + // the original vector type. Both is going to be the same, a pointer + // to a scalar. 
+ Value castedAccessChain = (vectorType.getNumElements() == 1) + ? accessChain + : rewriter.create( + loc, vectorPtrType, accessChain); + rewriter.replaceOpWithNewOp(storeOp, castedAccessChain, adaptor.getValueToStore()); diff --git a/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir b/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir index 3f0bf1962e299..4701ac5d96009 100644 --- a/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir +++ b/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir @@ -1004,6 +1004,27 @@ func.func @vector_load(%arg0 : memref<4xf32, #spirv.storage_class return %0: vector<4xf32> } + +// CHECK-LABEL: @vector_load_single_elem +// CHECK-SAME: (%[[ARG0:.*]]: memref<4xf32, #spirv.storage_class>) +// CHECK: %[[S0:.+]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<4xf32, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[S1:.+]] = builtin.unrealized_conversion_cast %[[C0]] : index to i32 +// CHECK: %[[CST1:.+]] = spirv.Constant 0 : i32 +// CHECK: %[[CST2:.+]] = spirv.Constant 0 : i32 +// CHECK: %[[CST3:.+]] = spirv.Constant 1 : i32 +// CHECK: %[[S4:.+]] = spirv.AccessChain %[[S0]][%[[CST1]], %[[S1]]] : !spirv.ptr [0])>, StorageBuffer>, i32, i32 +// CHECK: %[[S5:.+]] = spirv.Load "StorageBuffer" %[[S4]] : f32 +// CHECK: %[[R0:.+]] = builtin.unrealized_conversion_cast %[[S5]] : f32 to vector<1xf32> +// CHECK: return %[[R0]] : vector<1xf32> +func.func @vector_load_single_elem(%arg0 : memref<4xf32, #spirv.storage_class>) -> vector<1xf32> { + %idx = arith.constant 0 : index + %cst_0 = arith.constant 0.000000e+00 : f32 + %0 = vector.load %arg0[%idx] : memref<4xf32, #spirv.storage_class>, vector<1xf32> + return %0: vector<1xf32> +} + + // CHECK-LABEL: @vector_load_2d // CHECK-SAME: (%[[ARG0:.*]]: memref<4x4xf32, #spirv.storage_class>) -> vector<4xf32> { // CHECK: %[[S0:.+]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<4x4xf32, 
#spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> @@ -1046,6 +1067,24 @@ func.func @vector_store(%arg0 : memref<4xf32, #spirv.storage_class> +// CHECK-SAME: %[[ARG1:.*]]: vector<1xf32> +// CHECK: %[[S0:.+]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<4xf32, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> +// CHECK: %[[S1:.+]] = builtin.unrealized_conversion_cast %[[ARG1]] : vector<1xf32> to f32 +// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[S2:.+]] = builtin.unrealized_conversion_cast %[[C0]] : index to i32 +// CHECK: %[[CST1:.+]] = spirv.Constant 0 : i32 +// CHECK: %[[CST2:.+]] = spirv.Constant 0 : i32 +// CHECK: %[[CST3:.+]] = spirv.Constant 1 : i32 +// CHECK: %[[S4:.+]] = spirv.AccessChain %[[S0]][%[[CST1]], %[[S2]]] : !spirv.ptr [0])>, StorageBuffer>, i32, i32 -> !spirv.ptr +// CHECK: spirv.Store "StorageBuffer" %[[S4]], %[[S1]] : f32 +func.func @vector_store_single_elem(%arg0 : memref<4xf32, #spirv.storage_class>, %arg1 : vector<1xf32>) { + %idx = arith.constant 0 : index + vector.store %arg1, %arg0[%idx] : memref<4xf32, #spirv.storage_class>, vector<1xf32> + return +} + // CHECK-LABEL: @vector_store_2d // CHECK-SAME: (%[[ARG0:.*]]: memref<4x4xf32, #spirv.storage_class> // CHECK-SAME: %[[ARG1:.*]]: vector<4xf32> From 427b24a40879d512802ca4c04d18b0f124e2a684 Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Fri, 7 Feb 2025 21:33:29 +0000 Subject: [PATCH 024/293] [compiler-rt][rtsan] adding readlink(at) interception (#126262) --- .../lib/rtsan/rtsan_interceptors_posix.cpp | 23 +++++++++++++++++++ .../tests/rtsan_test_interceptors_posix.cpp | 18 +++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 83e6cdd4a0094..410da0748b433 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -254,6 +254,27 @@ INTERCEPTOR(int, fchdir, int fd) { 
return REAL(fchdir)(fd); } +#if SANITIZER_INTERCEPT_READLINK +INTERCEPTOR(ssize_t, readlink, const char *pathname, char *buf, size_t size) { + __rtsan_notify_intercepted_call("readlink"); + return REAL(readlink)(pathname, buf, size); +} +#define RTSAN_MAYBE_INTERCEPT_READLINK INTERCEPT_FUNCTION(readlink) +#else +#define RTSAN_MAYBE_INTERCEPT_READLINK +#endif + +#if SANITIZER_INTERCEPT_READLINKAT +INTERCEPTOR(ssize_t, readlinkat, int dirfd, const char *pathname, char *buf, + size_t size) { + __rtsan_notify_intercepted_call("readlinkat"); + return REAL(readlinkat)(dirfd, pathname, buf, size); +} +#define RTSAN_MAYBE_INTERCEPT_READLINKAT INTERCEPT_FUNCTION(readlinkat) +#else +#define RTSAN_MAYBE_INTERCEPT_READLINKAT +#endif + // Streams INTERCEPTOR(FILE *, fopen, const char *path, const char *mode) { @@ -1402,6 +1423,8 @@ void __rtsan::InitializeInterceptors() { INTERCEPT_FUNCTION(close); INTERCEPT_FUNCTION(chdir); INTERCEPT_FUNCTION(fchdir); + RTSAN_MAYBE_INTERCEPT_READLINK; + RTSAN_MAYBE_INTERCEPT_READLINKAT; INTERCEPT_FUNCTION(fopen); RTSAN_MAYBE_INTERCEPT_FOPEN64; RTSAN_MAYBE_INTERCEPT_FREOPEN64; diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index 075f5974b7562..98d27caae94b8 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -457,6 +457,24 @@ TEST(TestRtsanInterceptors, FchdirDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } +#if SANITIZER_INTERCEPT_READLINK +TEST(TestRtsanInterceptors, ReadlinkDiesWhenRealtime) { + char buf[1024]; + auto Func = [&buf]() { readlink("/proc/self", buf, sizeof(buf)); }; + ExpectRealtimeDeath(Func, "readlink"); + ExpectNonRealtimeSurvival(Func); +} +#endif + +#if SANITIZER_INTERCEPT_READLINKAT +TEST(TestRtsanInterceptors, ReadlinkatDiesWhenRealtime) { + char buf[1024]; + auto Func = [&buf]() { readlinkat(0, "/proc/self", buf, sizeof(buf)); }; + 
ExpectRealtimeDeath(Func, "readlinkat"); + ExpectNonRealtimeSurvival(Func); +} +#endif + TEST_F(RtsanFileTest, FopenDiesWhenRealtime) { auto Func = [this]() { FILE *f = fopen(GetTemporaryFilePath(), "w"); From 6dbe5422907ff342fd5971a88892aee8f75a25ca Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Fri, 7 Feb 2025 13:36:06 -0800 Subject: [PATCH 025/293] [libc] Use `${libc_opt_high_flag}` instead of `-O3` (#123233) This is preferable since `${libc_opt_high_flag}` will be set correctly for the compiler used. --- libc/cmake/modules/LLVMLibCTestRules.cmake | 4 +++- libc/src/math/generic/CMakeLists.txt | 2 +- libc/test/src/__support/CMakeLists.txt | 2 +- libc/test/src/math/CMakeLists.txt | 6 +++--- libc/test/src/math/exhaustive/CMakeLists.txt | 2 +- libc/test/src/math/smoke/CMakeLists.txt | 8 ++++---- libc/test/src/stdfix/CMakeLists.txt | 16 ++++++++-------- libc/utils/MPFRWrapper/CMakeLists.txt | 2 +- 8 files changed, 22 insertions(+), 20 deletions(-) diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 5709a11b1a201..ffbdb40cd5091 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -223,6 +223,8 @@ function(create_libc_unittest fq_target_name) _get_common_test_compile_options(compile_options "${LIBC_UNITTEST_C_TEST}" "${LIBC_UNITTEST_FLAGS}") + # TODO: Ideally we would have a separate function for link options. 
+ set(link_options ${compile_options}) list(APPEND compile_options ${LIBC_UNITTEST_COMPILE_OPTIONS}) if(SHOW_INTERMEDIATE_OBJECTS) @@ -277,7 +279,7 @@ function(create_libc_unittest fq_target_name) target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}) target_compile_options(${fq_build_target_name} PRIVATE ${compile_options}) - target_link_options(${fq_build_target_name} PRIVATE ${compile_options}) + target_link_options(${fq_build_target_name} PRIVATE ${link_options}) if(NOT LIBC_UNITTEST_CXX_STANDARD) set(LIBC_UNITTEST_CXX_STANDARD ${CMAKE_CXX_STANDARD}) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 0e57051807b33..14e63d6cc1395 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -534,7 +534,7 @@ add_entrypoint_object( libc.src.__support.macros.optimization libc.src.__support.macros.properties.types COMPILE_OPTIONS - -O3 + ${libc_opt_high_flag} ) add_entrypoint_object( diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index aeb8edf305d05..8d175e857fcd1 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -234,7 +234,7 @@ add_libc_test( libc.src.stdlib.srand libc.src.string.memset COMPILE_OPTIONS - -O3 + ${libc_opt_high_flag} UNIT_TEST_ONLY # Aligned Allocation is not supported in hermetic builds. 
) diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index bbcdf2363c1e2..f000ff6f3cf47 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -1597,7 +1597,7 @@ add_fp_unittest( libc.src.math.sqrtf libc.src.__support.FPUtil.generic.sqrt COMPILE_OPTIONS - -O3 + ${libc_opt_high_flag} ) add_fp_unittest( @@ -1613,7 +1613,7 @@ add_fp_unittest( libc.src.math.sqrt libc.src.__support.FPUtil.generic.sqrt COMPILE_OPTIONS - -O3 + ${libc_opt_high_flag} ) add_fp_unittest( @@ -1629,7 +1629,7 @@ add_fp_unittest( libc.src.math.sqrtl libc.src.__support.FPUtil.generic.sqrt COMPILE_OPTIONS - -O3 + ${libc_opt_high_flag} ) add_fp_unittest( diff --git a/libc/test/src/math/exhaustive/CMakeLists.txt b/libc/test/src/math/exhaustive/CMakeLists.txt index 423c3b7a8bfd1..b1927dbc19a3b 100644 --- a/libc/test/src/math/exhaustive/CMakeLists.txt +++ b/libc/test/src/math/exhaustive/CMakeLists.txt @@ -305,7 +305,7 @@ add_fp_unittest( SRCS hypotf_test.cpp COMPILE_OPTIONS - -O3 + ${libc_opt_high_flag} DEPENDS .exhaustive_test libc.src.math.hypotf diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index e0cb531b40421..f3ecba3737e38 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -2993,7 +2993,7 @@ add_fp_unittest( DEPENDS libc.src.__support.FPUtil.generic.sqrt COMPILE_OPTIONS - -O3 + ${libc_opt_high_flag} ) add_fp_unittest( @@ -3007,7 +3007,7 @@ add_fp_unittest( DEPENDS libc.src.__support.FPUtil.generic.sqrt COMPILE_OPTIONS - -O3 + ${libc_opt_high_flag} ) add_fp_unittest( @@ -3021,7 +3021,7 @@ add_fp_unittest( DEPENDS libc.src.__support.FPUtil.generic.sqrt COMPILE_OPTIONS - -O3 + ${libc_opt_high_flag} ) add_fp_unittest( @@ -3038,7 +3038,7 @@ add_fp_unittest( libc.src.math.sqrtf128 libc.src.__support.FPUtil.generic.sqrt COMPILE_OPTIONS - -O3 + ${libc_opt_high_flag} ) add_fp_unittest( diff --git a/libc/test/src/stdfix/CMakeLists.txt 
b/libc/test/src/stdfix/CMakeLists.txt index 90d20438edb4b..e4d4fc5b52558 100644 --- a/libc/test/src/stdfix/CMakeLists.txt +++ b/libc/test/src/stdfix/CMakeLists.txt @@ -14,7 +14,7 @@ foreach(suffix IN ITEMS hr r lr hk k lk) SRCS abs${suffix}_test.cpp COMPILE_OPTIONS - -O3 + ${libc_opt_high_flag} DEPENDS libc.src.stdfix.abs${suffix} libc.src.__support.fixed_point.fx_bits @@ -31,7 +31,7 @@ foreach(suffix IN ITEMS uhr ur ulr uhk uk) SRCS sqrt${suffix}_test.cpp COMPILE_OPTIONS - -O3 + ${libc_opt_high_flag} DEPENDS libc.src.stdfix.sqrt${suffix} libc.src.__support.CPP.bit @@ -52,7 +52,7 @@ foreach(suffix IN ITEMS hr r lr hk k lk uhr ur ulr uhk uk ulk) SRCS round${suffix}_test.cpp COMPILE_OPTIONS - -O3 + ${libc_opt_high_flag} DEPENDS libc.src.stdfix.round${suffix} libc.src.__support.fixed_point.fx_bits @@ -67,7 +67,7 @@ foreach(suffix IN ITEMS hr r lr hk k lk uhr ur ulr uhk uk ulk) SRCS ${suffix}bits_test.cpp COMPILE_OPTIONS - -O3 + ${libc_opt_high_flag} DEPENDS libc.src.stdfix.${suffix}bits libc.src.__support.CPP.bit @@ -84,7 +84,7 @@ add_libc_test( SRCS uhksqrtus_test.cpp COMPILE_OPTIONS - -O3 + ${libc_opt_high_flag} DEPENDS libc.src.stdfix.uhksqrtus libc.src.__support.CPP.bit @@ -103,7 +103,7 @@ add_libc_test( SRCS uksqrtui_test.cpp COMPILE_OPTIONS - -O3 + ${libc_opt_high_flag} DEPENDS libc.src.stdfix.uksqrtui libc.src.__support.CPP.bit @@ -122,7 +122,7 @@ add_libc_test( SRCS exphk_test.cpp COMPILE_OPTIONS - -O3 + ${libc_opt_high_flag} DEPENDS libc.src.stdfix.exphk libc.src.math.exp @@ -140,7 +140,7 @@ add_libc_test( SRCS expk_test.cpp COMPILE_OPTIONS - -O3 + ${libc_opt_high_flag} DEPENDS libc.src.stdfix.expk libc.src.math.exp diff --git a/libc/utils/MPFRWrapper/CMakeLists.txt b/libc/utils/MPFRWrapper/CMakeLists.txt index f7df9146c8d48..9ff7fa109ff97 100644 --- a/libc/utils/MPFRWrapper/CMakeLists.txt +++ b/libc/utils/MPFRWrapper/CMakeLists.txt @@ -34,7 +34,7 @@ if(LIBC_TESTS_CAN_USE_MPFR) _get_common_test_compile_options(compile_options "" "") # mpfr/gmp headers do not 
work with -ffreestanding flag. list(REMOVE_ITEM compile_options "-ffreestanding") - target_compile_options(libcMPFRWrapper PRIVATE -O3 ${compile_options}) + target_compile_options(libcMPFRWrapper PRIVATE ${libc_opt_high_flag} ${compile_options}) add_dependencies( libcMPFRWrapper libcMPCommon From addbb4448487717283d334e48c63868d6f8553be Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 7 Feb 2025 15:40:45 -0600 Subject: [PATCH 026/293] [LinkerWrapper] Clean up options after proper forwarding (#126297) Summary: Recent changes made a lot of this stuff redundant or unused, clean it up a bit. Also snuck in a change to pass the CUDA path since we still use it for `fatbinary` internally. --- clang/lib/Driver/ToolChains/Clang.cpp | 11 +++++++++ clang/test/Driver/linker-wrapper.c | 24 +++++++++---------- clang/test/Driver/openmp-offload.c | 10 ++++++++ .../ClangLinkerWrapper.cpp | 23 +----------------- .../clang-linker-wrapper/LinkerWrapperOpts.td | 22 ++++------------- 5 files changed, 39 insertions(+), 51 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 62d4336c6be59..0a6756eadba31 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -9220,6 +9220,14 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, for (StringRef Arg : LinkerArgs) CmdArgs.push_back(Args.MakeArgString( "--device-linker=" + TC->getTripleString() + "=" + Arg)); + + // Forward the LTO mode relying on the Driver's parsing. 
+ if (C.getDriver().getOffloadLTOMode() == LTOK_Full) + CmdArgs.push_back(Args.MakeArgString( + "--device-compiler=" + TC->getTripleString() + "=-flto=full")); + else if (C.getDriver().getOffloadLTOMode() == LTOK_Thin) + CmdArgs.push_back(Args.MakeArgString( + "--device-compiler=" + TC->getTripleString() + "=-flto=thin")); } } @@ -9227,6 +9235,9 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, Args.MakeArgString("--host-triple=" + getToolChain().getTripleString())); if (Args.hasArg(options::OPT_v)) CmdArgs.push_back("--wrapper-verbose"); + if (Arg *A = Args.getLastArg(options::OPT_cuda_path_EQ)) + CmdArgs.push_back( + Args.MakeArgString(Twine("--cuda-path=") + A->getValue())); // Construct the link job so we can wrap around it. Linker->ConstructJob(C, JA, Output, Inputs, Args, LinkingOutput); diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c index f416ee5f4463b..e7b7af7bdfbf3 100644 --- a/clang/test/Driver/linker-wrapper.c +++ b/clang/test/Driver/linker-wrapper.c @@ -21,16 +21,16 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ // RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=NVPTX-LINK -// NVPTX-LINK: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 -flto {{.*}}.o {{.*}}.o +// NVPTX-LINK: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out -// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --device-debug -O0 \ +// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run 
--device-compiler=-g \ // RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=NVPTX-LINK-DEBUG -// NVPTX-LINK-DEBUG: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 -flto {{.*}}.o {{.*}}.o -g +// NVPTX-LINK-DEBUG: clang{{.*}} --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}-g // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ @@ -39,16 +39,16 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ // RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=AMDGPU-LINK -// AMDGPU-LINK: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o +// AMDGPU-LINK: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -Wl,--no-undefined {{.*}}.o {{.*}}.o // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.amdgpu.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030 \ // RUN: --image=file=%t.amdgpu.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out -// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --save-temps -O2 \ +// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --device-compiler=--save-temps \ // RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=AMDGPU-LTO-TEMPS -// AMDGPU-LTO-TEMPS: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -O2 -flto -Wl,--no-undefined {{.*}}.o -save-temps +// AMDGPU-LTO-TEMPS: clang{{.*}} --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}-save-temps // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu \ @@ -59,7 +59,7 @@ __attribute__((visibility("protected"), used)) int 
x; // RUN: --linker-path=/usr/bin/ld.lld --whole-archive %t.a --no-whole-archive \ // RUN: %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CPU-LINK -// CPU-LINK: clang{{.*}} -o {{.*}}.img --target=x86_64-unknown-linux-gnu -march=native -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o -Wl,-Bsymbolic -shared -Wl,--whole-archive {{.*}}.a -Wl,--no-whole-archive +// CPU-LINK: clang{{.*}} -o {{.*}}.img --target=x86_64-unknown-linux-gnu -march=native -Wl,--no-undefined {{.*}}.o {{.*}}.o -Wl,-Bsymbolic -shared -Wl,--whole-archive {{.*}}.a -Wl,--no-whole-archive // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o // RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu -mllvm -openmp-opt-disable \ @@ -148,7 +148,7 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --clang-backend \ // RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CLANG-BACKEND -// CLANG-BACKEND: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -flto -Wl,--no-undefined {{.*}}.o +// CLANG-BACKEND: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -Wl,--no-undefined {{.*}}.o // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 @@ -171,8 +171,8 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ // RUN: --linker-path=/usr/bin/ld %t-on.o %t-off.o %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=AMD-TARGET-ID -// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+ -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o -// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack- -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o +// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa 
-mcpu=gfx90a:xnack+ -Wl,--no-undefined {{.*}}.o {{.*}}.o +// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack- -Wl,--no-undefined {{.*}}.o {{.*}}.o // RUN: clang-offload-packager -o %t-lib.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=generic @@ -187,8 +187,8 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ // RUN: --linker-path=/usr/bin/ld %t1.o %t2.o %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=ARCH-ALL -// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o -// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o +// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a -Wl,--no-undefined {{.*}}.o {{.*}}.o +// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -Wl,--no-undefined {{.*}}.o {{.*}}.o // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu \ diff --git a/clang/test/Driver/openmp-offload.c b/clang/test/Driver/openmp-offload.c index 6f56ae00ba065..2cf2643af6c15 100644 --- a/clang/test/Driver/openmp-offload.c +++ b/clang/test/Driver/openmp-offload.c @@ -208,3 +208,13 @@ // RUN: -fsyntax-only %s 2>&1 | FileCheck -check-prefix=CHK-SYNTAX-ONLY-ARGS %s // CHK-SYNTAX-ONLY-ARGS: "-cc1" "-triple" "powerpc64le-ibm-linux-gnu"{{.*}}"-fsyntax-only" // CHK-SYNTAX-ONLY-ARGS: "-cc1" "-triple" "powerpc64le-unknown-linux"{{.*}}"-fsyntax-only" + +// +// Ensure `-foffload-lto` is forwarded properly. 
+// +// RUN: %clang -### --target=powerpc64le-linux -fopenmp=libomp -fopenmp-targets=powerpc64le-ibm-linux-gnu \ +// RUN: -foffload-lto %s 2>&1 | FileCheck -check-prefix=CHK-DEVICE-LTO-FULL %s +// CHK-DEVICE-LTO-FULL: clang-linker-wrapper{{.*}} "--device-compiler=powerpc64le-ibm-linux-gnu=-flto=full" +// RUN: %clang -### --target=powerpc64le-linux -fopenmp=libomp -fopenmp-targets=powerpc64le-ibm-linux-gnu \ +// RUN: -foffload-lto=thin %s 2>&1 | FileCheck -check-prefix=CHK-DEVICE-LTO-THIN %s +// CHK-DEVICE-LTO-THIN: clang-linker-wrapper{{.*}} "--device-compiler=powerpc64le-ibm-linux-gnu=-flto=thin" diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index b189cfee674dd..1a82a1c59b721 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -485,7 +485,6 @@ Expected clang(ArrayRef InputFiles, const ArgList &Args) { if (!TempFileOrErr) return TempFileOrErr.takeError(); - StringRef OptLevel = Args.getLastArgValue(OPT_opt_level, "O2"); SmallVector CmdArgs{ *ClangPath, "--no-default-config", @@ -493,12 +492,9 @@ Expected clang(ArrayRef InputFiles, const ArgList &Args) { *TempFileOrErr, Args.MakeArgString("--target=" + Triple.getTriple()), Triple.isAMDGPU() ? Args.MakeArgString("-mcpu=" + Arch) - : Args.MakeArgString("-march=" + Arch), - Args.MakeArgString("-" + OptLevel), - }; + : Args.MakeArgString("-march=" + Arch)}; // Forward all of the `--offload-opt` and similar options to the device. 
- CmdArgs.push_back("-flto"); for (auto &Arg : Args.filtered(OPT_offload_opt_eq_minus, OPT_mllvm)) CmdArgs.append( {"-Xlinker", @@ -547,29 +543,12 @@ Expected clang(ArrayRef InputFiles, const ArgList &Args) { CmdArgs.append({"-Xlinker", Args.MakeArgString( "-mllvm=" + StringRef(Arg->getValue()))}); - if (Args.hasArg(OPT_debug)) - CmdArgs.push_back("-g"); - - if (SaveTemps) - CmdArgs.push_back("-save-temps"); - if (SaveTemps && linkerSupportsLTO(Args)) CmdArgs.push_back("-Wl,--save-temps"); if (Args.hasArg(OPT_embed_bitcode)) CmdArgs.push_back("-Wl,--lto-emit-llvm"); - if (Verbose) - CmdArgs.push_back("-v"); - - if (!CudaBinaryPath.empty()) - CmdArgs.push_back(Args.MakeArgString("--cuda-path=" + CudaBinaryPath)); - - for (StringRef Arg : Args.getAllArgValues(OPT_ptxas_arg)) - llvm::copy( - SmallVector({"-Xcuda-ptxas", Args.MakeArgString(Arg)}), - std::back_inserter(CmdArgs)); - for (StringRef Arg : Args.getAllArgValues(OPT_linker_arg_EQ)) CmdArgs.append({"-Xlinker", Args.MakeArgString(Arg)}); for (StringRef Arg : Args.getAllArgValues(OPT_compiler_arg_EQ)) diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td index 57d918db0a73c..17fb9db35fe39 100644 --- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td +++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td @@ -17,11 +17,9 @@ def cuda_path_EQ : Joined<["--"], "cuda-path=">, Flags<[WrapperOnlyOption]>, MetaVarName<"">, HelpText<"Set the system CUDA path">; def host_triple_EQ : Joined<["--"], "host-triple=">, - Flags<[WrapperOnlyOption]>, MetaVarName<"">, - HelpText<"Triple to use for the host compilation">; -def opt_level : Joined<["--"], "opt-level=">, - Flags<[WrapperOnlyOption]>, MetaVarName<"">, - HelpText<"Optimization level for LTO">; + Flags<[WrapperOnlyOption]>, + MetaVarName<"">, + HelpText<"Triple to use for the host compilation">; def device_linker_args_EQ : Joined<["--"], "device-linker=">, Flags<[WrapperOnlyOption]>, 
MetaVarName<" or =">, HelpText<"Arguments to pass to the device linker invocation">; @@ -34,18 +32,8 @@ def dry_run : Flag<["--"], "dry-run">, def verbose : Flag<["--"], "wrapper-verbose">, Flags<[WrapperOnlyOption]>, HelpText<"Verbose output from tools">; def embed_bitcode : Flag<["--"], "embed-bitcode">, - Flags<[WrapperOnlyOption]>, HelpText<"Embed linked bitcode in the module">; -def debug : Flag<["--"], "device-debug">, Flags<[WrapperOnlyOption]>, - HelpText<"Use debugging">; -def ptxas_arg : Joined<["--"], "ptxas-arg=">, - Flags<[WrapperOnlyOption]>, - HelpText<"Argument to pass to the 'ptxas' invocation">; -def pass_remarks_EQ : Joined<["--"], "pass-remarks=">, - Flags<[WrapperOnlyOption]>, HelpText<"Pass remarks for LTO">; -def pass_remarks_missed_EQ : Joined<["--"], "pass-remarks-missed=">, - Flags<[WrapperOnlyOption]>, HelpText<"Pass remarks for LTO">; -def pass_remarks_analysis_EQ : Joined<["--"], "pass-remarks-analysis=">, - Flags<[WrapperOnlyOption]>, HelpText<"Pass remarks for LTO">; + Flags<[WrapperOnlyOption]>, + HelpText<"Embed linked bitcode in the module">; def print_wrapped_module : Flag<["--"], "print-wrapped-module">, Flags<[WrapperOnlyOption]>, HelpText<"Print the wrapped module's IR for testing">; From 76985fd7cafddec5254c15caeeeabc80e5a8c2f9 Mon Sep 17 00:00:00 2001 From: joaosaffran <126493771+joaosaffran@users.noreply.github.com> Date: Fri, 7 Feb 2025 14:19:19 -0800 Subject: [PATCH 027/293] [DXIL] Adding support to RootSignatureFlags in obj2yaml (#122396) This PR adds: - `RootSignatureFlags` extraction from DXContainer using `obj2yaml` This PR is part of: #121493 --------- Co-authored-by: joaosaffran --- llvm/include/llvm/BinaryFormat/DXContainer.h | 24 +++++++ .../BinaryFormat/DXContainerConstants.def | 21 ++++++ .../llvm/MC/DXContainerRootSignature.h | 28 ++++++++ llvm/include/llvm/Object/DXContainer.h | 28 ++++++++ .../include/llvm/ObjectYAML/DXContainerYAML.h | 23 ++++++ llvm/lib/MC/CMakeLists.txt | 1 + 
llvm/lib/MC/DXContainerRootSignature.cpp | 23 ++++++ llvm/lib/Object/DXContainer.cpp | 61 ++++++++++++++++ llvm/lib/ObjectYAML/DXContainerEmitter.cpp | 15 ++++ llvm/lib/ObjectYAML/DXContainerYAML.cpp | 33 +++++++++ .../DXContainer/RootSignature-Flags.yaml | 33 +++++++++ llvm/tools/obj2yaml/dxcontainer2yaml.cpp | 5 ++ llvm/unittests/Object/DXContainerTest.cpp | 70 +++++++++++++++++++ .../ObjectYAML/DXContainerYAMLTest.cpp | 39 +++++++++++ 14 files changed, 404 insertions(+) create mode 100644 llvm/include/llvm/MC/DXContainerRootSignature.h create mode 100644 llvm/lib/MC/DXContainerRootSignature.cpp create mode 100644 llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index 21e28d546286e..fbab066bf4517 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -14,6 +14,8 @@ #define LLVM_BINARYFORMAT_DXCONTAINER_H #include "llvm/ADT/StringRef.h" +#include "llvm/Support/BinaryStreamError.h" +#include "llvm/Support/Error.h" #include "llvm/Support/SwapByteOrder.h" #include "llvm/TargetParser/Triple.h" @@ -152,6 +154,11 @@ enum class FeatureFlags : uint64_t { static_assert((uint64_t)FeatureFlags::NextUnusedBit <= 1ull << 63, "Shader flag bits exceed enum size."); +#define ROOT_ELEMENT_FLAG(Num, Val) Val = 1ull << Num, +enum class RootElementFlag : uint32_t { +#include "DXContainerConstants.def" +}; + PartType parsePartType(StringRef S); struct VertexPSVInfo { @@ -541,6 +548,23 @@ struct ProgramSignatureElement { static_assert(sizeof(ProgramSignatureElement) == 32, "ProgramSignatureElement is misaligned"); +struct RootSignatureValidations { + + static Expected validateRootFlag(uint32_t Flags) { + if ((Flags & ~0x80000fff) != 0) + return llvm::make_error("Invalid Root Signature flag"); + return Flags; + } + + static Expected validateVersion(uint32_t Version) { + if (Version == 1 || Version == 2) + return 
Version; + + return llvm::make_error( + "Invalid Root Signature Version"); + } +}; + } // namespace dxbc } // namespace llvm diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index 96d4499c9cadc..6d44ea14df444 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -4,6 +4,7 @@ CONTAINER_PART(DXIL) CONTAINER_PART(SFI0) CONTAINER_PART(HASH) CONTAINER_PART(PSV0) +CONTAINER_PART(RTS0) CONTAINER_PART(ISG1) CONTAINER_PART(OSG1) CONTAINER_PART(PSG1) @@ -52,6 +53,26 @@ SHADER_FEATURE_FLAG(31, 36, NextUnusedBit, "Next reserved shader flag bit (not a #undef SHADER_FEATURE_FLAG #endif // SHADER_FEATURE_FLAG + +// ROOT_ELEMENT_FLAG(bit offset for the flag, name). +#ifdef ROOT_ELEMENT_FLAG + +ROOT_ELEMENT_FLAG(0, AllowInputAssemblerInputLayout) +ROOT_ELEMENT_FLAG(1, DenyVertexShaderRootAccess) +ROOT_ELEMENT_FLAG(2, DenyHullShaderRootAccess) +ROOT_ELEMENT_FLAG(3, DenyDomainShaderRootAccess) +ROOT_ELEMENT_FLAG(4, DenyGeometryShaderRootAccess) +ROOT_ELEMENT_FLAG(5, DenyPixelShaderRootAccess) +ROOT_ELEMENT_FLAG(6, AllowStreamOutput) +ROOT_ELEMENT_FLAG(7, LocalRootSignature) +ROOT_ELEMENT_FLAG(8, DenyAmplificationShaderRootAccess) +ROOT_ELEMENT_FLAG(9, DenyMeshShaderRootAccess) +ROOT_ELEMENT_FLAG(10, CBVSRVUAVHeapDirectlyIndexed) +ROOT_ELEMENT_FLAG(11, SamplerHeapDirectlyIndexed) +#undef ROOT_ELEMENT_FLAG +#endif // ROOT_ELEMENT_FLAG + + #ifdef DXIL_MODULE_FLAG // Only save DXIL module flags which not map to feature flags here. 
diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h new file mode 100644 index 0000000000000..e1a9be5fc52d8 --- /dev/null +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -0,0 +1,28 @@ +//===- llvm/MC/DXContainerRootSignature.h - RootSignature -*- C++ -*- ========// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +namespace llvm { + +class raw_ostream; + +namespace mcdxbc { +struct RootSignatureHeader { + uint32_t Version = 2; + uint32_t NumParameters = 0; + uint32_t RootParametersOffset = 0; + uint32_t NumStaticSamplers = 0; + uint32_t StaticSamplersOffset = 0; + uint32_t Flags = 0; + + void write(raw_ostream &OS); +}; +} // namespace mcdxbc +} // namespace llvm diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index 19c83ba6c6e85..c3a2f756bd683 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -116,6 +116,28 @@ template struct ViewArray { }; namespace DirectX { + +class RootSignature { +private: + uint32_t Version; + uint32_t NumParameters; + uint32_t RootParametersOffset; + uint32_t NumStaticSamplers; + uint32_t StaticSamplersOffset; + uint32_t Flags; + +public: + RootSignature() {} + + Error parse(StringRef Data); + uint32_t getVersion() const { return Version; } + uint32_t getNumParameters() const { return NumParameters; } + uint32_t getRootParametersOffset() const { return RootParametersOffset; } + uint32_t getNumStaticSamplers() const { return NumStaticSamplers; } + uint32_t getStaticSamplersOffset() const { return StaticSamplersOffset; } + uint32_t getFlags() const { return Flags; } +}; + class PSVRuntimeInfo { using ResourceArray = 
ViewArray; @@ -287,6 +309,7 @@ class DXContainer { std::optional ShaderFeatureFlags; std::optional Hash; std::optional PSVInfo; + std::optional RootSignature; DirectX::Signature InputSignature; DirectX::Signature OutputSignature; DirectX::Signature PatchConstantSignature; @@ -296,6 +319,7 @@ class DXContainer { Error parseDXILHeader(StringRef Part); Error parseShaderFeatureFlags(StringRef Part); Error parseHash(StringRef Part); + Error parseRootSignature(StringRef Part); Error parsePSVInfo(StringRef Part); Error parseSignature(StringRef Part, DirectX::Signature &Array); friend class PartIterator; @@ -382,6 +406,10 @@ class DXContainer { std::optional getShaderHash() const { return Hash; } + std::optional getRootSignature() const { + return RootSignature; + } + const std::optional &getPSVInfo() const { return PSVInfo; }; diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index 66ad057ab0e30..0200f5cb196ff 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -17,6 +17,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/Object/DXContainer.h" #include "llvm/ObjectYAML/YAML.h" #include "llvm/Support/YAMLTraits.h" #include @@ -72,6 +73,22 @@ struct ShaderHash { std::vector Digest; }; +#define ROOT_ELEMENT_FLAG(Num, Val) bool Val = false; +struct RootSignatureDesc { + RootSignatureDesc() = default; + RootSignatureDesc(const object::DirectX::RootSignature &Data); + + uint32_t Version; + uint32_t NumParameters; + uint32_t RootParametersOffset; + uint32_t NumStaticSamplers; + uint32_t StaticSamplersOffset; + + uint32_t getEncodedFlags(); + +#include "llvm/BinaryFormat/DXContainerConstants.def" +}; + using ResourceFlags = dxbc::PSV::ResourceFlags; using ResourceBindInfo = dxbc::PSV::v2::ResourceBindInfo; @@ -159,6 +176,7 @@ struct Part { std::optional Hash; std::optional Info; std::optional Signature; + 
std::optional RootSignature; }; struct Object { @@ -241,6 +259,11 @@ template <> struct MappingTraits { static void mapping(IO &IO, llvm::DXContainerYAML::Signature &El); }; +template <> struct MappingTraits { + static void mapping(IO &IO, + DXContainerYAML::RootSignatureDesc &RootSignature); +}; + } // namespace yaml } // namespace llvm diff --git a/llvm/lib/MC/CMakeLists.txt b/llvm/lib/MC/CMakeLists.txt index e1d19196c8766..f49f14c848b90 100644 --- a/llvm/lib/MC/CMakeLists.txt +++ b/llvm/lib/MC/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_component_library(LLVMMC ConstantPools.cpp DXContainerPSVInfo.cpp + DXContainerRootSignature.cpp ELFObjectWriter.cpp GOFFObjectWriter.cpp MCAsmBackend.cpp diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp new file mode 100644 index 0000000000000..000d23f24d241 --- /dev/null +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -0,0 +1,23 @@ +//===- llvm/MC/DXContainerRootSignature.cpp - RootSignature -*- C++ -*-=======// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/DXContainerRootSignature.h" +#include "llvm/Support/EndianStream.h" + +using namespace llvm; +using namespace llvm::mcdxbc; + +void RootSignatureHeader::write(raw_ostream &OS) { + + support::endian::write(OS, Version, llvm::endianness::little); + support::endian::write(OS, NumParameters, llvm::endianness::little); + support::endian::write(OS, RootParametersOffset, llvm::endianness::little); + support::endian::write(OS, NumStaticSamplers, llvm::endianness::little); + support::endian::write(OS, StaticSamplersOffset, llvm::endianness::little); + support::endian::write(OS, Flags, llvm::endianness::little); +} diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 3b1a6203a1f8f..f28b096008b2f 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -10,6 +10,7 @@ #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Object/Error.h" #include "llvm/Support/Alignment.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/FormatVariadic.h" using namespace llvm; @@ -92,6 +93,15 @@ Error DXContainer::parseHash(StringRef Part) { return Error::success(); } +Error DXContainer::parseRootSignature(StringRef Part) { + if (RootSignature) + return parseFailed("More than one RTS0 part is present in the file"); + RootSignature = DirectX::RootSignature(); + if (Error Err = RootSignature->parse(Part)) + return Err; + return Error::success(); +} + Error DXContainer::parsePSVInfo(StringRef Part) { if (PSVInfo) return parseFailed("More than one PSV0 part is present in the file"); @@ -193,6 +203,10 @@ Error DXContainer::parsePartOffsets() { break; case dxbc::PartType::Unknown: break; + case dxbc::PartType::RTS0: + if (Error Err = parseRootSignature(PartData)) + return Err; + break; } } @@ -228,6 +242,53 @@ void 
DXContainer::PartIterator::updateIteratorImpl(const uint32_t Offset) { IteratorState.Offset = Offset; } +Error DirectX::RootSignature::parse(StringRef Data) { + const char *Current = Data.begin(); + + // Root Signature headers expects 6 integers to be present. + if (Data.size() < 6 * sizeof(uint32_t)) + return parseFailed( + "Invalid root signature, insufficient space for header."); + + uint32_t VValue = + support::endian::read(Current); + Current += sizeof(uint32_t); + + Expected MaybeVersion = + dxbc::RootSignatureValidations::validateVersion(VValue); + if (Error E = MaybeVersion.takeError()) + return E; + Version = MaybeVersion.get(); + + NumParameters = + support::endian::read(Current); + Current += sizeof(uint32_t); + + RootParametersOffset = + support::endian::read(Current); + Current += sizeof(uint32_t); + + NumStaticSamplers = + support::endian::read(Current); + Current += sizeof(uint32_t); + + StaticSamplersOffset = + support::endian::read(Current); + Current += sizeof(uint32_t); + + uint32_t FValue = + support::endian::read(Current); + Current += sizeof(uint32_t); + + Expected MaybeFlag = + dxbc::RootSignatureValidations::validateRootFlag(FValue); + if (Error E = MaybeFlag.takeError()) + return E; + Flags = MaybeFlag.get(); + + return Error::success(); +} + Error DirectX::PSVRuntimeInfo::parse(uint16_t ShaderKind) { Triple::EnvironmentType ShaderStage = dxbc::getShaderStage(ShaderKind); diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index 175f1a12f9314..b7d1c6558fa1f 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -13,6 +13,7 @@ #include "llvm/BinaryFormat/DXContainer.h" #include "llvm/MC/DXContainerPSVInfo.h" +#include "llvm/MC/DXContainerRootSignature.h" #include "llvm/ObjectYAML/ObjectYAML.h" #include "llvm/ObjectYAML/yaml2obj.h" #include "llvm/Support/Errc.h" @@ -261,6 +262,20 @@ void DXContainerWriter::writeParts(raw_ostream &OS) { } case 
dxbc::PartType::Unknown: break; // Skip any handling for unrecognized parts. + case dxbc::PartType::RTS0: + if (!P.RootSignature.has_value()) + continue; + + mcdxbc::RootSignatureHeader Header; + Header.Flags = P.RootSignature->getEncodedFlags(); + Header.Version = P.RootSignature->Version; + Header.NumParameters = P.RootSignature->NumParameters; + Header.RootParametersOffset = P.RootSignature->RootParametersOffset; + Header.NumStaticSamplers = P.RootSignature->NumStaticSamplers; + Header.StaticSamplersOffset = P.RootSignature->StaticSamplersOffset; + + Header.write(OS); + break; } uint64_t BytesWritten = OS.tell() - DataStart; RollingOffset += BytesWritten; diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 5dee1221b27c0..0869fd4fa9785 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -29,6 +29,27 @@ DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) { #include "llvm/BinaryFormat/DXContainerConstants.def" } +DXContainerYAML::RootSignatureDesc::RootSignatureDesc( + const object::DirectX::RootSignature &Data) + : Version(Data.getVersion()), NumParameters(Data.getNumParameters()), + RootParametersOffset(Data.getRootParametersOffset()), + NumStaticSamplers(Data.getNumStaticSamplers()), + StaticSamplersOffset(Data.getStaticSamplersOffset()) { + uint32_t Flags = Data.getFlags(); +#define ROOT_ELEMENT_FLAG(Num, Val) \ + Val = (Flags & (uint32_t)dxbc::RootElementFlag::Val) > 0; +#include "llvm/BinaryFormat/DXContainerConstants.def" +} + +uint32_t DXContainerYAML::RootSignatureDesc::getEncodedFlags() { + uint64_t Flag = 0; +#define ROOT_ELEMENT_FLAG(Num, Val) \ + if (Val) \ + Flag |= (uint32_t)dxbc::RootElementFlag::Val; +#include "llvm/BinaryFormat/DXContainerConstants.def" + return Flag; +} + uint64_t DXContainerYAML::ShaderFeatureFlags::getEncodedFlags() { uint64_t Flag = 0; #define SHADER_FEATURE_FLAG(Num, DxilModuleNum, Val, Str) \ @@ -188,6 
+209,17 @@ void MappingTraits::mapping( IO.mapRequired("Parameters", S.Parameters); } +void MappingTraits::mapping( + IO &IO, DXContainerYAML::RootSignatureDesc &S) { + IO.mapRequired("Version", S.Version); + IO.mapRequired("NumParameters", S.NumParameters); + IO.mapRequired("RootParametersOffset", S.RootParametersOffset); + IO.mapRequired("NumStaticSamplers", S.NumStaticSamplers); + IO.mapRequired("StaticSamplersOffset", S.StaticSamplersOffset); +#define ROOT_ELEMENT_FLAG(Num, Val) IO.mapOptional(#Val, S.Val, false); +#include "llvm/BinaryFormat/DXContainerConstants.def" +} + void MappingTraits::mapping(IO &IO, DXContainerYAML::Part &P) { IO.mapRequired("Name", P.Name); @@ -197,6 +229,7 @@ void MappingTraits::mapping(IO &IO, IO.mapOptional("Hash", P.Hash); IO.mapOptional("PSVInfo", P.Info); IO.mapOptional("Signature", P.Signature); + IO.mapOptional("RootSignature", P.RootSignature); } void MappingTraits::mapping( diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml new file mode 100644 index 0000000000000..b0a3e6945f454 --- /dev/null +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-Flags.yaml @@ -0,0 +1,33 @@ +# RUN: yaml2obj %s | obj2yaml | FileCheck %s + +--- !dxcontainer +Header: + Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + Version: + Major: 1 + Minor: 0 + PartCount: 1 + PartOffsets: [ 60 ] +Parts: + - Name: RTS0 + Size: 24 + RootSignature: + Version: 2 + NumParameters: 1 + RootParametersOffset: 3 + NumStaticSamplers: 4 + StaticSamplersOffset: 5 + AllowInputAssemblerInputLayout: true + DenyGeometryShaderRootAccess: true + +# CHECK: - Name: RTS0 +# CHECK-NEXT: Size: 24 +# CHECK-NEXT: RootSignature: +# CHECK-NEXT: Version: 2 +# CHECK-NEXT: NumParameters: 1 +# CHECK-NEXT: RootParametersOffset: 3 +# CHECK-NEXT: NumStaticSamplers: 4 +# CHECK-NEXT: StaticSamplersOffset: 5 +# CHECK-NEXT: AllowInputAssemblerInputLayout: true +# CHECK-NEXT: 
DenyGeometryShaderRootAccess: true diff --git a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp index 06966b1883586..54a912d9438af 100644 --- a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp +++ b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp @@ -153,6 +153,11 @@ dumpDXContainer(MemoryBufferRef Source) { break; case dxbc::PartType::Unknown: break; + case dxbc::PartType::RTS0: + std::optional RS = Container.getRootSignature(); + if (RS.has_value()) + NewPart.RootSignature = DXContainerYAML::RootSignatureDesc(*RS); + break; } } diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index 5a2c852d6aef9..88a915f560e05 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -821,3 +821,73 @@ TEST(DXCFile, MalformedSignature) { "the end of the part data")); } } + +TEST(RootSignature, ParseRootFlags) { + { + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, + 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + }; + DXContainer C = + llvm::cantFail(DXContainer::create(getMemoryBuffer<68>(Buffer))); + + const auto &RS = C.getRootSignature(); + ASSERT_TRUE(RS.has_value()); + ASSERT_EQ(RS->getVersion(), 2); + ASSERT_EQ(RS->getNumParameters(), 0); + ASSERT_EQ(RS->getRootParametersOffset(), 0); + ASSERT_EQ(RS->getNumStaticSamplers(), 0); + ASSERT_EQ(RS->getStaticSamplersOffset(), 0); + ASSERT_EQ(RS->getFlags(), 0x01); + } + + { + // this parameter has the root signature definition missing some values. 
+ uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, + 0x6F, 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, + 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, + 0x00, 0x00, 0x00, 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + EXPECT_THAT_EXPECTED( + DXContainer::create(getMemoryBuffer<64>(Buffer)), + FailedWithMessage( + "Invalid root signature, insufficient space for header.")); + } + { + // Version has been changed to an invalid number. + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, + 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + }; + EXPECT_THAT_EXPECTED( + DXContainer::create(getMemoryBuffer<68>(Buffer)), + FailedWithMessage("Stream Error: An unspecified error has occurred. " + "Invalid Root Signature Version")); + } + { + // Flag has been set to an invalid value + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, + 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0xFF, + }; + EXPECT_THAT_EXPECTED( + DXContainer::create(getMemoryBuffer<68>(Buffer)), + FailedWithMessage("Stream Error: An unspecified error has occurred. 
" + "Invalid Root Signature flag")); + } +} diff --git a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp index d4232295c8584..b48cd9ce53987 100644 --- a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp +++ b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp @@ -107,3 +107,42 @@ TEST(DXCFile, ParseEmptyParts) { EXPECT_EQ(Storage.size(), 116u); EXPECT_TRUE(memcmp(Buffer, Storage.data(), 116) == 0); } + +TEST(RootSignature, ParseRootFlags) { + SmallString<128> Storage; + + // First read a fully explicit yaml with all sizes and offsets provided + ASSERT_TRUE(convert(Storage, R"(--- !dxcontainer + Header: + Hash: [ 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, 0x5, + 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1 ] + Version: + Major: 1 + Minor: 0 + FileSize: 68 + PartCount: 1 + PartOffsets: [ 36 ] + Parts: + - Name: RTS0 + Size: 24 + RootSignature: + Version: 2 + NumParameters: 0 + RootParametersOffset: 0 + NumStaticSamplers: 0 + StaticSamplersOffset: 0 + AllowInputAssemblerInputLayout: true + )")); + + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x32, 0x9A, 0x53, 0xD8, 0xEC, 0xBE, 0x35, 0x6F, + 0x05, 0x39, 0xE1, 0xFE, 0x31, 0x20, 0xF0, 0xC1, 0x01, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + }; + + EXPECT_EQ(Storage.size(), 68u); + EXPECT_TRUE(memcmp(Buffer, Storage.data(), 68u) == 0); +} From 756dab4c25c30743a1bdcae8a8e8b8d22f1873b1 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 7 Feb 2025 22:19:30 +0000 Subject: [PATCH 028/293] [gn build] Port 76985fd7cafd --- llvm/utils/gn/secondary/llvm/lib/MC/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/MC/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/MC/BUILD.gn index 
274ab154c441b..c76962ae92c1c 100644 --- a/llvm/utils/gn/secondary/llvm/lib/MC/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/MC/BUILD.gn @@ -14,6 +14,7 @@ static_library("MC") { sources = [ "ConstantPools.cpp", "DXContainerPSVInfo.cpp", + "DXContainerRootSignature.cpp", "ELFObjectWriter.cpp", "GOFFObjectWriter.cpp", "MCAsmBackend.cpp", From 170cdadf7d1fa254d5a77648b65c4c72e78c8b75 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Fri, 7 Feb 2025 14:50:52 -0800 Subject: [PATCH 029/293] [libc] Exercise all CMake build types in GitHub Action workflows (#126315) We want to test libc in all build configurations: Debug, Release and MinSizeRel which correspond to -O0, -O3 and -Os optimization flags. --- .github/workflows/libc-fullbuild-tests.yml | 3 ++- .github/workflows/libc-overlay-tests.yml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/libc-fullbuild-tests.yml b/.github/workflows/libc-fullbuild-tests.yml index 2c88da653aae4..d93ac84116240 100644 --- a/.github/workflows/libc-fullbuild-tests.yml +++ b/.github/workflows/libc-fullbuild-tests.yml @@ -15,6 +15,7 @@ jobs: strategy: fail-fast: false matrix: + build_type: [Debug, Release, MinSizeRel] include: - os: ubuntu-24.04 ccache-variant: sccache @@ -68,7 +69,7 @@ jobs: cmake -B ${{ steps.strings.outputs.build-output-dir }} -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} - -DCMAKE_BUILD_TYPE=MinSizeRel + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCMAKE_C_COMPILER_LAUNCHER=${{ matrix.ccache-variant }} -DCMAKE_CXX_COMPILER_LAUNCHER=${{ matrix.ccache-variant }} -DCMAKE_INSTALL_PREFIX=${{ steps.strings.outputs.build-install-dir }} diff --git a/.github/workflows/libc-overlay-tests.yml b/.github/workflows/libc-overlay-tests.yml index 0a0916084b18c..de4b58c008ee4 100644 --- a/.github/workflows/libc-overlay-tests.yml +++ b/.github/workflows/libc-overlay-tests.yml @@ -16,6 +16,7 @@ jobs: # Set fail-fast to false to ensure that feedback is 
delivered for all matrix combinations. fail-fast: false matrix: + build_type: [Debug, Release, MinSizeRel] include: # TODO: add linux gcc when it is fixed - os: ubuntu-24.04 @@ -95,7 +96,7 @@ jobs: cmake -B ${{ steps.strings.outputs.build-output-dir }} -DCMAKE_CXX_COMPILER=${{ matrix.compiler.cpp_compiler }} -DCMAKE_C_COMPILER=${{ matrix.compiler.c_compiler }} - -DCMAKE_BUILD_TYPE=MinSizeRel + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCMAKE_C_COMPILER_LAUNCHER=${{ matrix.ccache-variant }} -DCMAKE_CXX_COMPILER_LAUNCHER=${{ matrix.ccache-variant }} -DCMAKE_POLICY_DEFAULT_CMP0141=NEW From 898112e529eae3f4c2210d9b03b14b71869ebc9e Mon Sep 17 00:00:00 2001 From: David Pagan Date: Fri, 7 Feb 2025 14:53:49 -0800 Subject: [PATCH 030/293] [OpenMP][Docs] Update OpenMP supported features table (#126292) Updated status to 'done' for OpenMP 6.0 features: - OpenMP directives in concurrent loop regions - atomics constructs on concurrent loop regions - Lift nesting restriction on concurrent loop Removed duplicate OpenMP 6.0 feature per Michael Klemm: - atomic constructs in loop region --- clang/docs/OpenMPSupport.rst | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index c31d6e90ecb08..725624ee8c66c 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -416,9 +416,9 @@ implementation. 
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | safe_sync and progress with identifier and API | :none:`unclaimed` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| OpenMP directives in concurrent loop regions | :none:`unclaimed` | :none:`unclaimed` | | +| OpenMP directives in concurrent loop regions | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/125621 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| atomics constructs on concurrent loop regions | :none:`unclaimed` | :none:`unclaimed` | | +| atomics constructs on concurrent loop regions | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/125621 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | Loop construct with DO CONCURRENT | :none:`unclaimed` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ @@ -456,9 +456,7 @@ implementation. 
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | map-type modifiers in arbitrary position | :none:`unclaimed` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| atomic constructs in loop region | :none:`unclaimed` | :none:`unclaimed` | | -+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Lift nesting restriction on concurrent loop | :none:`unclaimed` | :none:`unclaimed` | | +| Lift nesting restriction on concurrent loop | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/125621 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | priority clause for target constructs | :none:`unclaimed` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ From 51e7dc8627e636ff69ee8bcb0bf599b7a3a4957f Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Fri, 7 Feb 2025 23:47:23 +0000 Subject: [PATCH 031/293] Use explicit unsigned literals to fix mixed sign comparisons --- llvm/unittests/Object/DXContainerTest.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index 88a915f560e05..e7b491103d2d0 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ 
b/llvm/unittests/Object/DXContainerTest.cpp @@ -837,12 +837,12 @@ TEST(RootSignature, ParseRootFlags) { const auto &RS = C.getRootSignature(); ASSERT_TRUE(RS.has_value()); - ASSERT_EQ(RS->getVersion(), 2); - ASSERT_EQ(RS->getNumParameters(), 0); - ASSERT_EQ(RS->getRootParametersOffset(), 0); - ASSERT_EQ(RS->getNumStaticSamplers(), 0); - ASSERT_EQ(RS->getStaticSamplersOffset(), 0); - ASSERT_EQ(RS->getFlags(), 0x01); + ASSERT_EQ(RS->getVersion(), 2u); + ASSERT_EQ(RS->getNumParameters(), 0u); + ASSERT_EQ(RS->getRootParametersOffset(), 0u); + ASSERT_EQ(RS->getNumStaticSamplers(), 0u); + ASSERT_EQ(RS->getStaticSamplersOffset(), 0u); + ASSERT_EQ(RS->getFlags(), 0x01u); } { From 343bbda140d5a15cd7d7fbfc6041a7506da5cdae Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Fri, 7 Feb 2025 23:26:06 +0000 Subject: [PATCH 032/293] Use a stable sort to handle overlapping/duplicate line sequences This can occur due to linker ICF and stable sort will ensure the results are stable. No explicit/new test coverage, because nondeterminism is non-testable. It should already be covered by the DWARFDebugLineTest that was failing some internal testing on an ARM machine which might've been what changed the sort order. But `llvm::sort` also deliberately randomizes the contents (under EXPENSIVE_CHECKS) so I'd have expected failures to show up in any EXPENSIVE_CHECKS Build... --- llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp index adcd0aa329420..62bf3d4ecaaf0 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp @@ -1274,13 +1274,14 @@ Error DWARFDebugLine::LineTable::parse( // Sort all sequences so that address lookup will work faster. 
if (!Sequences.empty()) { - llvm::sort(Sequences, Sequence::orderByHighPC); + llvm::stable_sort(Sequences, Sequence::orderByHighPC); // Note: actually, instruction address ranges of sequences should not // overlap (in shared objects and executables). If they do, the address // lookup would still work, though, but result would be ambiguous. // We don't report warning in this case. For example, // sometimes .so compiled from multiple object files contains a few // rudimentary sequences for address ranges [0x0, 0xsomething). + // Address ranges may also overlap when using ICF. } // Terminate the table with a final blank line to clearly delineate it from From e6e8ac59ba45e03da92aebec1f4561c1fa970df1 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Sat, 8 Feb 2025 01:06:27 +0100 Subject: [PATCH 033/293] [Flang] Optionally do not compile the runtime in-tree (#122336) Introduce the CMake switch FLANG_INCLUDE_RUNTIME. When set to off, do not add build instructions for the runtime. This is required for Flang-RT (#110217) and the current runtime CMake code to co-exist. When using `LLVM_ENABLE_RUNTIMES=flang-rt`, the in-tree build instructions are in conflict and must be disabled.
--- flang/CMakeLists.txt | 6 +++++- flang/examples/CMakeLists.txt | 4 +++- flang/test/CMakeLists.txt | 6 +++++- flang/test/Driver/ctofortran.f90 | 1 + flang/test/Driver/exec.f90 | 1 + flang/test/Runtime/no-cpp-dep.c | 2 +- flang/test/lit.cfg.py | 5 ++++- flang/test/lit.site.cfg.py.in | 2 ++ flang/tools/f18/CMakeLists.txt | 2 +- flang/unittests/CMakeLists.txt | 11 +++++++++- flang/unittests/Evaluate/CMakeLists.txt | 27 +++++++++++++------------ 11 files changed, 47 insertions(+), 20 deletions(-) diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt index 2e27bc2279ac4..e6de8df5cef15 100644 --- a/flang/CMakeLists.txt +++ b/flang/CMakeLists.txt @@ -247,6 +247,8 @@ else() include_directories(SYSTEM ${MLIR_TABLEGEN_OUTPUT_DIR}) endif() +option(FLANG_INCLUDE_RUNTIME "Build the runtime in-tree (deprecated; to be replaced with LLVM_ENABLE_RUNTIMES=flang-rt)" ON) + set(FLANG_TOOLS_INSTALL_DIR "${CMAKE_INSTALL_BINDIR}" CACHE PATH "Path for binary subdirectory (defaults to '${CMAKE_INSTALL_BINDIR}')") mark_as_advanced(FLANG_TOOLS_INSTALL_DIR) @@ -487,7 +489,9 @@ if (FLANG_BUILD_TOOLS) add_subdirectory(tools) endif() -add_subdirectory(runtime) +if (FLANG_INCLUDE_RUNTIME) + add_subdirectory(runtime) +endif () if (LLVM_INCLUDE_EXAMPLES) add_subdirectory(examples) diff --git a/flang/examples/CMakeLists.txt b/flang/examples/CMakeLists.txt index 8cc66ddbbbb0e..23c6e790791fb 100644 --- a/flang/examples/CMakeLists.txt +++ b/flang/examples/CMakeLists.txt @@ -1,4 +1,6 @@ -add_subdirectory(ExternalHelloWorld) +if (FLANG_INCLUDE_RUNTIME) + add_subdirectory(ExternalHelloWorld) +endif () add_subdirectory(PrintFlangFunctionNames) add_subdirectory(FlangOmpReport) add_subdirectory(FeatureList) diff --git a/flang/test/CMakeLists.txt b/flang/test/CMakeLists.txt index cab214c2ef4c8..e398e0786147a 100644 --- a/flang/test/CMakeLists.txt +++ b/flang/test/CMakeLists.txt @@ -71,9 +71,13 @@ set(FLANG_TEST_DEPENDS llvm-objdump llvm-readobj split-file - FortranRuntime FortranDecimal ) + +if 
(FLANG_INCLUDE_RUNTIME) + list(APPEND FLANG_TEST_DEPENDS FortranRuntime) +endif () + if (LLVM_ENABLE_PLUGINS AND NOT WIN32) list(APPEND FLANG_TEST_DEPENDS Bye) endif() diff --git a/flang/test/Driver/ctofortran.f90 b/flang/test/Driver/ctofortran.f90 index 78eac32133b18..10c7adaccc958 100644 --- a/flang/test/Driver/ctofortran.f90 +++ b/flang/test/Driver/ctofortran.f90 @@ -1,4 +1,5 @@ ! UNSUPPORTED: system-windows +! REQUIRES: flang-rt ! RUN: split-file %s %t ! RUN: chmod +x %t/runtest.sh ! RUN: %t/runtest.sh %t %t/ffile.f90 %t/cfile.c %flang | FileCheck %s diff --git a/flang/test/Driver/exec.f90 b/flang/test/Driver/exec.f90 index fd174005ddf62..9ca91ee24011c 100644 --- a/flang/test/Driver/exec.f90 +++ b/flang/test/Driver/exec.f90 @@ -1,4 +1,5 @@ ! UNSUPPORTED: system-windows +! REQUIRES: flang-rt ! Verify that flang can correctly build executables. ! RUN: %flang %s -o %t diff --git a/flang/test/Runtime/no-cpp-dep.c b/flang/test/Runtime/no-cpp-dep.c index b1a5fa004014c..7303ce63fdec4 100644 --- a/flang/test/Runtime/no-cpp-dep.c +++ b/flang/test/Runtime/no-cpp-dep.c @@ -3,7 +3,7 @@ This test makes sure that flang's runtime does not depend on the C++ runtime library. It tries to link this simple file against libFortranRuntime.a with a C compiler. -REQUIRES: c-compiler +REQUIRES: c-compiler, flang-rt RUN: %if system-aix %{ export OBJECT_MODE=64 %} RUN: %cc -std=c99 %s -I%include %libruntime -lm \ diff --git a/flang/test/lit.cfg.py b/flang/test/lit.cfg.py index c452b6d231c89..f4580afc8c47b 100644 --- a/flang/test/lit.cfg.py +++ b/flang/test/lit.cfg.py @@ -163,10 +163,13 @@ ToolSubst("%not_todo_abort_cmd", command=FindTool("not"), unresolved="fatal") ) +if config.flang_include_runtime: + config.available_features.add("flang-rt") + # Define some variables to help us test that the flang runtime doesn't depend on # the C++ runtime libraries. For this we need a C compiler. If for some reason # we don't have one, we can just disable the test. 
-if config.cc: +if config.flang_include_runtime and config.cc: libruntime = os.path.join(config.flang_lib_dir, "libFortranRuntime.a") include = os.path.join(config.flang_src_dir, "include") diff --git a/flang/test/lit.site.cfg.py.in b/flang/test/lit.site.cfg.py.in index d1a0ac763cf8a..697ba3fa79763 100644 --- a/flang/test/lit.site.cfg.py.in +++ b/flang/test/lit.site.cfg.py.in @@ -1,6 +1,7 @@ @LIT_SITE_CFG_IN_HEADER@ import sys +import lit.util config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") config.llvm_shlib_dir = lit_config.substitute(path(r"@SHLIBDIR@")) @@ -32,6 +33,7 @@ else: config.openmp_module_dir = None config.flang_runtime_f128_math_lib = "@FLANG_RUNTIME_F128_MATH_LIB@" config.have_ldbl_mant_dig_113 = "@HAVE_LDBL_MANT_DIG_113@" +config.flang_include_runtime = lit.util.pythonize_bool("@FLANG_INCLUDE_RUNTIME@") import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt index cc2bc5b8eb5ce..85ba2c74cdeb5 100644 --- a/flang/tools/f18/CMakeLists.txt +++ b/flang/tools/f18/CMakeLists.txt @@ -109,7 +109,7 @@ if (NOT CMAKE_CROSSCOMPILING) set(compile_with "-fsyntax-only") set(object_output "") set(include_in_link FALSE) - if(${filename} IN_LIST MODULES_WITH_IMPLEMENTATION) + if(${filename} IN_LIST MODULES_WITH_IMPLEMENTATION AND FLANG_INCLUDE_RUNTIME) set(object_output "${CMAKE_CURRENT_BINARY_DIR}/${filename}${CMAKE_CXX_OUTPUT_EXTENSION}") set(compile_with -c -o ${object_output}) set(include_in_link TRUE) diff --git a/flang/unittests/CMakeLists.txt b/flang/unittests/CMakeLists.txt index 945067fed4f82..ecb7d68d8f729 100644 --- a/flang/unittests/CMakeLists.txt +++ b/flang/unittests/CMakeLists.txt @@ -1,3 +1,5 @@ +include(AddFlangOffloadRuntime) + if (FLANG_EXPERIMENTAL_CUDA_RUNTIME) # If Fortran runtime is built as CUDA library, the linking # of targets that link FortranRuntime must be done @@ -11,6 +13,11 @@ add_custom_target(FlangUnitTests) 
set_target_properties(FlangUnitTests PROPERTIES FOLDER "Flang/Tests") function(add_flang_unittest_offload_properties target) + # Do not apply runtime properties if not even compiling the runtime. + if (NOT FLANG_INCLUDE_RUNTIME) + return () + endif () + # Set CUDA_RESOLVE_DEVICE_SYMBOLS. if (FLANG_EXPERIMENTAL_CUDA_RUNTIME) set_target_properties(${target} @@ -75,5 +82,7 @@ add_subdirectory(Optimizer) add_subdirectory(Common) add_subdirectory(Decimal) add_subdirectory(Evaluate) -add_subdirectory(Runtime) +if (FLANG_INCLUDE_RUNTIME) + add_subdirectory(Runtime) +endif () add_subdirectory(Frontend) diff --git a/flang/unittests/Evaluate/CMakeLists.txt b/flang/unittests/Evaluate/CMakeLists.txt index 8111ecd72cfc7..1c3fac29cd298 100644 --- a/flang/unittests/Evaluate/CMakeLists.txt +++ b/flang/unittests/Evaluate/CMakeLists.txt @@ -33,7 +33,6 @@ add_flang_nongtest_unittest(intrinsics FortranDecimal FortranSemantics FortranParser - FortranRuntime ) add_flang_nongtest_unittest(logical @@ -56,19 +55,21 @@ add_flang_nongtest_unittest(real ) llvm_update_compile_flags(real.test) -add_flang_nongtest_unittest(reshape - NonGTestTesting - FortranSemantics - FortranEvaluate - FortranRuntime -) +if (FLANG_INCLUDE_RUNTIME) + add_flang_nongtest_unittest(reshape + NonGTestTesting + FortranSemantics + FortranEvaluate + FortranRuntime + ) -add_flang_nongtest_unittest(ISO-Fortran-binding - NonGTestTesting - FortranEvaluate - FortranSemantics - FortranRuntime -) + add_flang_nongtest_unittest(ISO-Fortran-binding + NonGTestTesting + FortranEvaluate + FortranSemantics + FortranRuntime + ) +endif () add_flang_nongtest_unittest(folding FortranSupport From 3e2afe5f019b7a1c60e23cb2743018bd2d0417b1 Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Fri, 7 Feb 2025 16:57:11 -0800 Subject: [PATCH 034/293] Revert "[Fuchsia] Support PGO" (#126293) Reverts llvm/llvm-project#120323 This breaks some internal Fuchsia builders. We can reland again later, once that is addressed.
--- .../caches/Fuchsia-stage2-instrumented.cmake | 44 --------------- clang/cmake/caches/Fuchsia.cmake | 55 ++++++++----------- 2 files changed, 23 insertions(+), 76 deletions(-) delete mode 100644 clang/cmake/caches/Fuchsia-stage2-instrumented.cmake diff --git a/clang/cmake/caches/Fuchsia-stage2-instrumented.cmake b/clang/cmake/caches/Fuchsia-stage2-instrumented.cmake deleted file mode 100644 index b3c3b63066363..0000000000000 --- a/clang/cmake/caches/Fuchsia-stage2-instrumented.cmake +++ /dev/null @@ -1,44 +0,0 @@ -# This file sets up a CMakeCache for the second stage of a Fuchsia toolchain build. - -include(${CMAKE_CURRENT_LIST_DIR}/Fuchsia-stage2.cmake) - -if(NOT APPLE) - set(BOOTSTRAP_LLVM_ENABLE_LLD ON CACHE BOOL "") -endif() - -set(CLANG_BOOTSTRAP_TARGETS - check-all - check-clang - check-lld - check-llvm - clang - clang-test-depends - toolchain-distribution - install-toolchain-distribution - install-toolchain-distribution-stripped - install-toolchain-distribution-toolchain - lld-test-depends - llvm-config - llvm-test-depends - test-depends - test-suite CACHE STRING "") - -get_cmake_property(variableNames VARIABLES) -foreach(variableName ${variableNames}) - if(variableName MATCHES "^STAGE2_") - string(REPLACE "STAGE2_" "" new_name ${variableName}) - list(APPEND EXTRA_ARGS "-D${new_name}=${${variableName}}") - endif() -endforeach() - -set(CLANG_PGO_TRAINING_DEPS - builtins - runtimes - CACHE STRING "") - -# Setup the bootstrap build. 
-set(CLANG_ENABLE_BOOTSTRAP ON CACHE BOOL "") -set(CLANG_BOOTSTRAP_CMAKE_ARGS - ${EXTRA_ARGS} - -C ${CMAKE_CURRENT_LIST_DIR}/Fuchsia-stage2.cmake - CACHE STRING "") diff --git a/clang/cmake/caches/Fuchsia.cmake b/clang/cmake/caches/Fuchsia.cmake index 373b7ddd6e344..83336589da305 100644 --- a/clang/cmake/caches/Fuchsia.cmake +++ b/clang/cmake/caches/Fuchsia.cmake @@ -126,16 +126,6 @@ else() set(LIBCXX_ENABLE_STATIC_ABI_LIBRARY ON CACHE BOOL "") set(LIBCXX_HARDENING_MODE "none" CACHE STRING "") set(LIBCXX_USE_COMPILER_RT ON CACHE BOOL "") - set(COMPILER_RT_BUILD_LIBFUZZER OFF CACHE BOOL "") - set(COMPILER_RT_BUILD_PROFILE ON CACHE BOOL "") - set(COMPILER_RT_BUILD_SANITIZERS OFF CACHE BOOL "") - set(COMPILER_RT_BUILD_XRAY OFF CACHE BOOL "") - set(COMPILER_RT_USE_BUILTINS_LIBRARY ON CACHE BOOL "") - set(COMPILER_RT_DEFAULT_TARGET_ONLY ON CACHE BOOL "") - set(SANITIZER_CXX_ABI "libc++" CACHE STRING "") - set(SANITIZER_CXX_ABI_INTREE ON CACHE BOOL "") - set(SANITIZER_TEST_CXX "libc++" CACHE STRING "") - set(SANITIZER_TEST_CXX_INTREE ON CACHE BOOL "") set(LLVM_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "") set(RUNTIMES_CMAKE_ARGS "-DCMAKE_OSX_DEPLOYMENT_TARGET=10.13;-DCMAKE_OSX_ARCHITECTURES=arm64|x86_64" CACHE STRING "") endif() @@ -174,29 +164,34 @@ endif() set(BOOTSTRAP_LLVM_ENABLE_LLD ON CACHE BOOL "") set(BOOTSTRAP_LLVM_ENABLE_LTO ON CACHE BOOL "") -set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED ON CACHE BOOL "") set(_FUCHSIA_BOOTSTRAP_TARGETS - generate-profdata - stage2 - stage2-toolchain-distribution - stage2-install-toolchain-distribution - stage2-install-toolchain-distribution-stripped - stage2-install-toolchain-distribution-toolchain - stage2-check-all - stage2-check-lld - stage2-check-llvm - stage2-check-clang - stage2-test-suite) + check-all + check-clang + check-lld + check-llvm + check-polly + llvm-config + clang-test-depends + lld-test-depends + llvm-test-depends + test-suite + test-depends + toolchain-distribution + 
install-toolchain-distribution + install-toolchain-distribution-stripped + install-toolchain-distribution-toolchain + clang) if(FUCHSIA_ENABLE_LLDB) list(APPEND _FUCHSIA_ENABLE_PROJECTS lldb) list(APPEND _FUCHSIA_BOOTSTRAP_TARGETS - stage2-check-lldb - stage2-debugger-distribution - stage2-install-debugger-distribution - stage2-install-debugger-distribution-stripped - stage2-install-debugger-distribution-toolchain) + check-lldb + lldb-test-depends + debugger-distribution + install-debugger-distribution + install-debugger-distribution-stripped + install-debugger-distribution-toolchain) endif() set(LLVM_ENABLE_PROJECTS ${_FUCHSIA_ENABLE_PROJECTS} CACHE STRING "") @@ -205,7 +200,6 @@ set(CLANG_BOOTSTRAP_TARGETS ${_FUCHSIA_BOOTSTRAP_TARGETS} CACHE STRING "") get_cmake_property(variableNames VARIABLES) foreach(variableName ${variableNames}) if(variableName MATCHES "^STAGE2_") - list(APPEND EXTRA_ARGS "-D${variableName}=${${variableName}}") string(REPLACE "STAGE2_" "" new_name ${variableName}) string(REPLACE ";" "|" value "${${variableName}}") list(APPEND EXTRA_ARGS "-D${new_name}=${value}") @@ -215,9 +209,6 @@ endforeach() # TODO: This is a temporary workaround until we figure out the right solution. set(BOOTSTRAP_LLVM_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "") -set(LLVM_BUILTIN_TARGETS "default" CACHE STRING "") -set(LLVM_RUNTIME_TARGETS "default" CACHE STRING "") - # Setup the bootstrap build. 
set(CLANG_ENABLE_BOOTSTRAP ON CACHE BOOL "") set(CLANG_BOOTSTRAP_EXTRA_DEPS @@ -226,5 +217,5 @@ set(CLANG_BOOTSTRAP_EXTRA_DEPS CACHE STRING "") set(CLANG_BOOTSTRAP_CMAKE_ARGS ${EXTRA_ARGS} - -C ${CMAKE_CURRENT_LIST_DIR}/Fuchsia-stage2-instrumented.cmake + -C ${CMAKE_CURRENT_LIST_DIR}/Fuchsia-stage2.cmake CACHE STRING "") From 7464dc8c7618aeb5a01998576bbcc4c88f0dde1d Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Fri, 7 Feb 2025 17:13:56 -0800 Subject: [PATCH 035/293] [RISCV] Include RISCVGenSearchTable.inc in RISCVISelDAGToDAG.h (#126326) This line was previously removed when 12d47247e5046b959af180e12f648c54e2c5e863 moved it to RISCVInstrInfo.h. But we probably don't want to have dangling `#define *_DECL` (RISCVGenSearchableTables.inc will `#undef` these macros) and I think there is no harm putting declarations of those search table functions in RISCVISelDAGToDAG.h. --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index 592f517358506..bb786e4b2bb40 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -278,6 +278,7 @@ struct VLX_VSXPseudo { #define GET_RISCVVSETable_DECL #define GET_RISCVVLXTable_DECL #define GET_RISCVVSXTable_DECL +#include "RISCVGenSearchableTables.inc" } // namespace RISCV } // namespace llvm From 51ba9819b40e04ef0ddbe141d3d30c32a295a0bc Mon Sep 17 00:00:00 2001 From: "A. Jiang" Date: Sat, 8 Feb 2025 09:26:59 +0800 Subject: [PATCH 036/293] [libc++][test] Fix `size_type` issues with `MinSequenceContainer` and `min_allocator` (#126267) `MinSequenceContainer::size` can be narrowing on 64-bit platforms, and MSVC complains about such implicit conversion. This PR changes the implicit conversion to explicit `static_cast`. 
`min_allocator::allocate` and `min_allocator::deallocate` have `ptrdiff_t` as the parameter type, which seems weird, because the underlying `std::allocator`'s member functions take `size_t`. It seems better to use `size_t` consistently. --- libcxx/test/support/MinSequenceContainer.h | 2 +- libcxx/test/support/min_allocator.h | 10 ++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/libcxx/test/support/MinSequenceContainer.h b/libcxx/test/support/MinSequenceContainer.h index d0e29ae40c400..7fee4dd0fbdc1 100644 --- a/libcxx/test/support/MinSequenceContainer.h +++ b/libcxx/test/support/MinSequenceContainer.h @@ -31,7 +31,7 @@ struct MinSequenceContainer { const_iterator cbegin() const { return const_iterator(data_.data()); } iterator end() { return begin() + size(); } const_iterator end() const { return begin() + size(); } - size_type size() const { return data_.size(); } + size_type size() const { return static_cast(data_.size()); } bool empty() const { return data_.empty(); } void clear() { data_.clear(); } diff --git a/libcxx/test/support/min_allocator.h b/libcxx/test/support/min_allocator.h index 18f51f8072640..d3ee27a23bc89 100644 --- a/libcxx/test/support/min_allocator.h +++ b/libcxx/test/support/min_allocator.h @@ -394,15 +394,9 @@ class min_allocator template TEST_CONSTEXPR_CXX20 min_allocator(min_allocator) {} - TEST_CONSTEXPR_CXX20 pointer allocate(std::ptrdiff_t n) - { - return pointer(std::allocator().allocate(n)); - } + TEST_CONSTEXPR_CXX20 pointer allocate(std::size_t n) { return pointer(std::allocator().allocate(n)); } - TEST_CONSTEXPR_CXX20 void deallocate(pointer p, std::ptrdiff_t n) - { - std::allocator().deallocate(p.ptr_, n); - } + TEST_CONSTEXPR_CXX20 void deallocate(pointer p, std::size_t n) { std::allocator().deallocate(p.ptr_, n); } TEST_CONSTEXPR_CXX20 friend bool operator==(min_allocator, min_allocator) {return true;} TEST_CONSTEXPR_CXX20 friend bool operator!=(min_allocator x, min_allocator y) {return !(x == y);} From 
12a154a94a9c2f6f0690adc7302da9c9e47ec806 Mon Sep 17 00:00:00 2001 From: Ami-zhang Date: Sat, 8 Feb 2025 09:48:41 +0800 Subject: [PATCH 037/293] [libunwind] Unwind through loongarch64/Linux sigreturn frame (#123682) Similar to D90898 (Linux AArch64), D124765 (SystemZ), and D148499 (RISCV). In this commit, I enabled two test cases, while zhuqizheng supported with the source code development. Co-Authored-By: zhuqizheng Co-authored-by: zhuqizheng --- libunwind/src/UnwindCursor.hpp | 64 ++++++++++++++++++++- libunwind/test/signal_unwind.pass.cpp | 2 +- libunwind/test/unwind_leaffunction.pass.cpp | 2 +- 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/libunwind/src/UnwindCursor.hpp b/libunwind/src/UnwindCursor.hpp index 3831d8e071ef3..0923052b1b588 100644 --- a/libunwind/src/UnwindCursor.hpp +++ b/libunwind/src/UnwindCursor.hpp @@ -31,8 +31,9 @@ #endif #if defined(_LIBUNWIND_TARGET_LINUX) && \ - (defined(_LIBUNWIND_TARGET_AARCH64) || defined(_LIBUNWIND_TARGET_RISCV) || \ - defined(_LIBUNWIND_TARGET_S390X)) + (defined(_LIBUNWIND_TARGET_AARCH64) || \ + defined(_LIBUNWIND_TARGET_LOONGARCH) || \ + defined(_LIBUNWIND_TARGET_RISCV) || defined(_LIBUNWIND_TARGET_S390X)) #include #include #include @@ -996,6 +997,10 @@ class UnwindCursor : public AbstractUnwindCursor{ bool setInfoForSigReturn(Registers_arm64 &); int stepThroughSigReturn(Registers_arm64 &); #endif +#if defined(_LIBUNWIND_TARGET_LOONGARCH) + bool setInfoForSigReturn(Registers_loongarch &); + int stepThroughSigReturn(Registers_loongarch &); +#endif #if defined(_LIBUNWIND_TARGET_RISCV) bool setInfoForSigReturn(Registers_riscv &); int stepThroughSigReturn(Registers_riscv &); @@ -2815,6 +2820,61 @@ int UnwindCursor::stepThroughSigReturn() { #endif // defined(_LIBUNWIND_CHECK_LINUX_SIGRETURN) && // defined(_LIBUNWIND_TARGET_AARCH64) +#if defined(_LIBUNWIND_CHECK_LINUX_SIGRETURN) && \ + defined(_LIBUNWIND_TARGET_LOONGARCH) +template +bool UnwindCursor::setInfoForSigReturn(Registers_loongarch &) { + const 
pint_t pc = static_cast(getReg(UNW_REG_IP)); + // The PC might contain an invalid address if the unwind info is bad, so + // directly accessing it could cause a SIGSEGV. + if (!isReadableAddr(pc)) + return false; + const auto *instructions = reinterpret_cast(pc); + // Look for the two instructions used in the sigreturn trampoline + // __vdso_rt_sigreturn: + // + // 0x03822c0b li a7,0x8b + // 0x002b0000 syscall 0 + if (instructions[0] != 0x03822c0b || instructions[1] != 0x002b0000) + return false; + + _info = {}; + _info.start_ip = pc; + _info.end_ip = pc + 4; + _isSigReturn = true; + return true; +} + +template +int UnwindCursor::stepThroughSigReturn(Registers_loongarch &) { + // In the signal trampoline frame, sp points to an rt_sigframe[1], which is: + // - 128-byte siginfo struct + // - ucontext_t struct: + // - 8-byte long (__uc_flags) + // - 8-byte pointer (*uc_link) + // - 24-byte uc_stack + // - 8-byte uc_sigmask + // - 120-byte of padding to allow sigset_t to be expanded in the future + // - 8 bytes of padding because sigcontext has 16-byte alignment + // - struct sigcontext uc_mcontext + // [1] + // https://github.com/torvalds/linux/blob/master/arch/loongarch/kernel/signal.c + const pint_t kOffsetSpToSigcontext = 128 + 8 + 8 + 24 + 8 + 128; + + const pint_t sigctx = _registers.getSP() + kOffsetSpToSigcontext; + _registers.setIP(_addressSpace.get64(sigctx)); + for (int i = UNW_LOONGARCH_R1; i <= UNW_LOONGARCH_R31; ++i) { + // skip R0 + uint64_t value = + _addressSpace.get64(sigctx + static_cast((i + 1) * 8)); + _registers.setRegister(i, value); + } + _isSignalFrame = true; + return UNW_STEP_SUCCESS; +} +#endif // defined(_LIBUNWIND_CHECK_LINUX_SIGRETURN) && + // defined(_LIBUNWIND_TARGET_LOONGARCH) + #if defined(_LIBUNWIND_CHECK_LINUX_SIGRETURN) && \ defined(_LIBUNWIND_TARGET_RISCV) template diff --git a/libunwind/test/signal_unwind.pass.cpp b/libunwind/test/signal_unwind.pass.cpp index 1c1566415a4d4..4de271ecb886b 100644 --- 
a/libunwind/test/signal_unwind.pass.cpp +++ b/libunwind/test/signal_unwind.pass.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // Ensure that the unwinder can cope with the signal handler. -// REQUIRES: target={{(aarch64|riscv64|s390x|x86_64)-.+linux.*}} +// REQUIRES: target={{(aarch64|loongarch64|riscv64|s390x|x86_64)-.+linux.*}} // TODO: Figure out why this fails with Memory Sanitizer. // XFAIL: msan diff --git a/libunwind/test/unwind_leaffunction.pass.cpp b/libunwind/test/unwind_leaffunction.pass.cpp index 98de7dc43260c..d336c159c131b 100644 --- a/libunwind/test/unwind_leaffunction.pass.cpp +++ b/libunwind/test/unwind_leaffunction.pass.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // Ensure that leaf function can be unwund. -// REQUIRES: target={{(aarch64|riscv64|s390x|x86_64)-.+linux.*}} +// REQUIRES: target={{(aarch64|loongarch64|riscv64|s390x|x86_64)-.+linux.*}} // TODO: Figure out why this fails with Memory Sanitizer. // XFAIL: msan From ff79d83caeeea8457f69406f38801fe8893bbfd8 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Sat, 8 Feb 2025 10:31:48 +0800 Subject: [PATCH 038/293] [LLDB][LoongArch] Extend the maximum number of watchpoints (#126204) The maximum number of load/store watchpoints and fetch instruction watchpoints is 14 each according to LoongArch Reference Manual [1], so extend the maximum number of watchpoints from 8 to 14 for ptrace. A new struct user_watch_state_v2 was added into uapi in the related kernel commit 531936dee53e ("LoongArch: Extend the maximum number of watchpoints") [2], but there may be no struct user_watch_state_v2 in the system header in time. In order to avoid undefined or redefined error, just add a new struct loongarch_user_watch_state in LLDB which is same with the uapi struct user_watch_state_v2, then replace the current user_watch_state with loongarch_user_watch_state. 
As far as I can tell, the only users of this struct in userspace are GDB and LLDB, and according to the analysis there are no software compatibility problems between the application and the kernel. The compatibility problem has been considered while developing and testing. When a userspace application gets the watchpoint state, it specifies a length that is no bigger than the size of struct user_watch_state or user_watch_state_v2; the actual length is then set to the minimum of the application's and the kernel's values in the generic ptrace code: ``` kernel/ptrace.c: ptrace_regset(): kiov->iov_len = min(kiov->iov_len, (__kernel_size_t) (regset->n * regset->size)); if (req == PTRACE_GETREGSET) return copy_regset_to_user(task, view, regset_no, 0, kiov->iov_len, kiov->iov_base); else return copy_regset_from_user(task, view, regset_no, 0, kiov->iov_len, kiov->iov_base); ``` For example, there are four kinds of combinations, and all of them work well. (1) "older kernel + older app", the actual length is 8+(8+8+4+4)*8=200; (2) "newer kernel + newer app", the actual length is 8+(8+8+4+4)*14=344; (3) "older kernel + newer app", the actual length is 8+(8+8+4+4)*8=200; (4) "newer kernel + older app", the actual length is 8+(8+8+4+4)*8=200. 
[1] https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#control-and-status-registers-related-to-watchpoints [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=531936dee53e Signed-off-by: Tiezhu Yang --- ...NativeRegisterContextLinux_loongarch64.cpp | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_loongarch64.cpp b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_loongarch64.cpp index 601dde2500948..c4841950f1e07 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_loongarch64.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_loongarch64.cpp @@ -50,6 +50,23 @@ #define REG_CONTEXT_SIZE \ (GetGPRSize() + GetFPRSize() + sizeof(m_lsx) + sizeof(m_lasx)) +// ptrace has a struct type user_watch_state, which was replaced by +// user_watch_state_v2 when more watchpoints were added, so this file +// may be built on systems with one or both in the system headers. +// The type below has the same layout as user_watch_state_v2 but will +// not clash with that name if it exists. We can use the v2 layout even +// on old kernels as we will only see 8 watchpoints and the kernel will +// truncate any extra data we send to it. 
+struct loongarch_user_watch_state { + uint64_t dbg_info; + struct { + uint64_t addr; + uint64_t mask; + uint32_t ctrl; + uint32_t pad; + } dbg_regs[14]; +}; + using namespace lldb; using namespace lldb_private; using namespace lldb_private::process_linux; @@ -539,7 +556,7 @@ llvm::Error NativeRegisterContextLinux_loongarch64::ReadHardwareDebugInfo() { int regset = NT_LOONGARCH_HW_WATCH; struct iovec ioVec; - struct user_watch_state dreg_state; + struct loongarch_user_watch_state dreg_state; Status error; ioVec.iov_base = &dreg_state; @@ -567,7 +584,7 @@ llvm::Error NativeRegisterContextLinux_loongarch64::ReadHardwareDebugInfo() { llvm::Error NativeRegisterContextLinux_loongarch64::WriteHardwareDebugRegs( DREGType hwbType) { struct iovec ioVec; - struct user_watch_state dreg_state; + struct loongarch_user_watch_state dreg_state; int regset; memset(&dreg_state, 0, sizeof(dreg_state)); From b850ce41db1e90cb2573ab5880da1d05de7828fd Mon Sep 17 00:00:00 2001 From: Uday Bondhugula Date: Sat, 8 Feb 2025 08:35:10 +0530 Subject: [PATCH 039/293] [MLIR][Affine] Fix private memref creation bug in affine fusion (#126028) Fix private memref creation bug in affine fusion exposed in the case of the same memref being loaded from/stored to in producer nest. Make the private memref replacement sound. Change affine fusion debug string to affine-fusion - more compact. 
Fixes: https://github.com/llvm/llvm-project/issues/48703 --- .../mlir/Dialect/Affine/Analysis/Utils.h | 8 ++ mlir/lib/Dialect/Affine/Analysis/Utils.cpp | 39 +++++++++ .../Dialect/Affine/Transforms/LoopFusion.cpp | 83 ++++++++++++++----- mlir/test/Dialect/Affine/loop-fusion-4.mlir | 60 ++++++++++++++ 4 files changed, 170 insertions(+), 20 deletions(-) diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/Utils.h b/mlir/include/mlir/Dialect/Affine/Analysis/Utils.h index b1fbf4477428c..7164ade6ea53a 100644 --- a/mlir/include/mlir/Dialect/Affine/Analysis/Utils.h +++ b/mlir/include/mlir/Dialect/Affine/Analysis/Utils.h @@ -610,6 +610,14 @@ FailureOr simplifyConstrainedMinMaxOp(Operation *op, FlatAffineValueConstraints constraints); +/// Find the innermost common `Block` of `a` and `b` in the affine scope +/// that `a` and `b` are part of. Return nullptr if they belong to different +/// affine scopes. Also, return nullptr if they do not have a common `Block` +/// ancestor (for eg., when they are part of the `then` and `else` regions +/// of an op that itself starts an affine scope. 
+mlir::Block *findInnermostCommonBlockInScope(mlir::Operation *a, + mlir::Operation *b); + } // namespace affine } // namespace mlir diff --git a/mlir/lib/Dialect/Affine/Analysis/Utils.cpp b/mlir/lib/Dialect/Affine/Analysis/Utils.cpp index 9c0b5dbf52d29..10de0d04cbea6 100644 --- a/mlir/lib/Dialect/Affine/Analysis/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/Utils.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Affine/Analysis/Utils.h" + #include "mlir/Analysis/Presburger/PresburgerRelation.h" #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" @@ -2297,3 +2298,41 @@ FailureOr mlir::affine::simplifyConstrainedMinMaxOp( affine::canonicalizeMapAndOperands(&newMap, &newOperands); return AffineValueMap(newMap, newOperands); } + +Block *mlir::affine::findInnermostCommonBlockInScope(Operation *a, + Operation *b) { + Region *aScope = mlir::affine::getAffineScope(a); + Region *bScope = mlir::affine::getAffineScope(b); + if (aScope != bScope) + return nullptr; + + // Get the block ancestry of `op` while stopping at the affine scope `aScope` + // and store them in `ancestry`. 
+ auto getBlockAncestry = [&](Operation *op, + SmallVectorImpl &ancestry) { + Operation *curOp = op; + do { + ancestry.push_back(curOp->getBlock()); + if (curOp->getParentRegion() == aScope) + break; + curOp = curOp->getParentOp(); + } while (curOp); + assert(curOp && "can't reach root op without passing through affine scope"); + std::reverse(ancestry.begin(), ancestry.end()); + }; + + SmallVector aAncestors, bAncestors; + getBlockAncestry(a, aAncestors); + getBlockAncestry(b, bAncestors); + assert(!aAncestors.empty() && !bAncestors.empty() && + "at least one Block ancestor expected"); + + Block *innermostCommonBlock = nullptr; + for (unsigned a = 0, b = 0, e = aAncestors.size(), f = bAncestors.size(); + a < e && b < f; ++a, ++b) { + if (aAncestors[a] != bAncestors[b]) + break; + innermostCommonBlock = aAncestors[a]; + } + return innermostCommonBlock; +} diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp index c22ec213be95c..fe6cf0f434cb7 100644 --- a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp @@ -41,7 +41,7 @@ namespace affine { } // namespace affine } // namespace mlir -#define DEBUG_TYPE "affine-loop-fusion" +#define DEBUG_TYPE "affine-fusion" using namespace mlir; using namespace mlir::affine; @@ -237,29 +237,67 @@ static void sinkSequentialLoops(MemRefDependenceGraph::Node *node) { node->op = newRootForOp; } -// Creates and returns a private (single-user) memref for fused loop rooted -// at 'forOp', with (potentially reduced) memref size based on the -// MemRefRegion written to by 'srcStoreOpInst' at depth 'dstLoopDepth'. -// TODO: consider refactoring the common code from generateDma and -// this one. 
-static Value createPrivateMemRef(AffineForOp forOp, Operation *srcStoreOpInst, +/// Get the operation that should act as a dominance filter while replacing +/// memref uses with a private memref for which `producerStores` and +/// `sliceInsertionBlock` are provided. This effectively determines in what +/// part of the IR we should be performing the replacement. +static Operation * +getDominanceFilterForPrivateMemRefRepl(Block *sliceInsertionBlock, + ArrayRef producerStores) { + assert(!producerStores.empty() && "expected producer store"); + + // We first find the common block that contains the producer stores and + // the slice computation. The first ancestor among the ancestors of the + // producer stores in that common block is the dominance filter to use for + // replacement. + Block *commonBlock = nullptr; + // Find the common block of all relevant operations. + for (Operation *store : producerStores) { + Operation *otherOp = + !commonBlock ? &*sliceInsertionBlock->begin() : &*commonBlock->begin(); + commonBlock = findInnermostCommonBlockInScope(store, otherOp); + } + assert(commonBlock && + "common block of producer stores and slice should exist"); + + // Find the first ancestor among the ancestors of `producerStores` in + // `commonBlock`. + Operation *firstAncestor = nullptr; + for (Operation *store : producerStores) { + Operation *ancestor = commonBlock->findAncestorOpInBlock(*store); + assert(ancestor && "producer store should be contained in common block"); + firstAncestor = !firstAncestor || ancestor->isBeforeInBlock(firstAncestor) + ? ancestor + : firstAncestor; + } + return firstAncestor; +} + +// Creates and returns a private (single-user) memref for fused loop rooted at +// 'forOp', with (potentially reduced) memref size based on the memref region +// written to by `storeOps` at depth 'dstLoopDepth'. 'sliceInsertionBlock' +// specifies the block in which the slice was/will be inserted. 
+static Value createPrivateMemRef(AffineForOp forOp, + ArrayRef storeOps, unsigned dstLoopDepth, std::optional fastMemorySpace, + Block *sliceInsertionBlock, uint64_t localBufSizeThreshold) { - Operation *forInst = forOp.getOperation(); + assert(!storeOps.empty() && "no source stores supplied"); + Operation *srcStoreOp = storeOps[0]; // Create builder to insert alloc op just before 'forOp'. - OpBuilder b(forInst); + OpBuilder b(forOp); // Builder to create constants at the top level. - OpBuilder top(forInst->getParentRegion()); + OpBuilder top(forOp->getParentRegion()); // Create new memref type based on slice bounds. - auto oldMemRef = cast(srcStoreOpInst).getMemRef(); + auto oldMemRef = cast(srcStoreOp).getMemRef(); auto oldMemRefType = cast(oldMemRef.getType()); unsigned rank = oldMemRefType.getRank(); // Compute MemRefRegion for 'srcStoreOpInst' at depth 'dstLoopDepth'. - MemRefRegion region(srcStoreOpInst->getLoc()); - bool validRegion = succeeded(region.compute(srcStoreOpInst, dstLoopDepth)); + MemRefRegion region(srcStoreOp->getLoc()); + bool validRegion = succeeded(region.compute(srcStoreOp, dstLoopDepth)); (void)validRegion; assert(validRegion && "unexpected memref region failure"); SmallVector newShape; @@ -332,11 +370,12 @@ static Value createPrivateMemRef(AffineForOp forOp, Operation *srcStoreOpInst, AffineMap::get(outerIVs.size() + rank, 0, remapExprs, forOp.getContext()); // Replace all users of 'oldMemRef' with 'newMemRef'. 
- LogicalResult res = - replaceAllMemRefUsesWith(oldMemRef, newMemRef, {}, indexRemap, - /*extraOperands=*/outerIVs, - /*symbolOperands=*/{}, - /*domOpFilter=*/&*forOp.getBody()->begin()); + Operation *domFilter = + getDominanceFilterForPrivateMemRefRepl(sliceInsertionBlock, storeOps); + LogicalResult res = replaceAllMemRefUsesWith( + oldMemRef, newMemRef, /*extraIndices=*/{}, indexRemap, + /*extraOperands=*/outerIVs, + /*symbolOperands=*/{}, domFilter); assert(succeeded(res) && "replaceAllMemrefUsesWith should always succeed here"); (void)res; @@ -944,6 +983,10 @@ struct GreedyFusion { // Create private memrefs. if (!privateMemrefs.empty()) { + // Note the block into which fusion was performed. This can be used to + // place `alloc`s that create private memrefs. + Block *sliceInsertionBlock = bestSlice.insertPoint->getBlock(); + // Gather stores for all the private-to-be memrefs. DenseMap> privateMemRefToStores; dstAffineForOp.walk([&](AffineWriteOpInterface storeOp) { @@ -962,8 +1005,8 @@ struct GreedyFusion { SmallVector &storesForMemref = memrefToStoresPair.second; Value newMemRef = createPrivateMemRef( - dstAffineForOp, storesForMemref[0], bestDstLoopDepth, - fastMemorySpace, localBufSizeThreshold); + dstAffineForOp, storesForMemref, bestDstLoopDepth, + fastMemorySpace, sliceInsertionBlock, localBufSizeThreshold); // Create new node in dependence graph for 'newMemRef' alloc op. unsigned newMemRefNodeId = mdg->addNode(newMemRef.getDefiningOp()); // Add edge from 'newMemRef' node to dstNode. 
diff --git a/mlir/test/Dialect/Affine/loop-fusion-4.mlir b/mlir/test/Dialect/Affine/loop-fusion-4.mlir index ea144f73bb21c..2830235431c76 100644 --- a/mlir/test/Dialect/Affine/loop-fusion-4.mlir +++ b/mlir/test/Dialect/Affine/loop-fusion-4.mlir @@ -285,3 +285,63 @@ module { spirv.ReturnValue %3 : !spirv.array<8192 x f32> } } + +// ----- + +// PRODUCER-CONSUMER-LABEL: func @same_memref_load_store +func.func @same_memref_load_store(%producer : memref<32xf32>, %consumer: memref<16xf32>){ + %cst = arith.constant 2.000000e+00 : f32 + // Source isn't removed. + // PRODUCER-CONSUMER: affine.for %{{.*}} = 0 to 32 + affine.for %arg3 = 0 to 32 { + %0 = affine.load %producer[%arg3] : memref<32xf32> + %2 = arith.mulf %0, %cst : f32 + affine.store %2, %producer[%arg3] : memref<32xf32> + } + affine.for %arg3 = 0 to 16 { + %0 = affine.load %producer[%arg3] : memref<32xf32> + %2 = arith.addf %0, %cst : f32 + affine.store %2, %consumer[%arg3] : memref<16xf32> + } + // Fused nest. + // PRODUCER-CONSUMER: affine.for %{{.*}} = 0 to 16 + // PRODUCER-CONSUMER-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<32xf32> + // PRODUCER-CONSUMER-NEXT: arith.mulf + // PRODUCER-CONSUMER-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32> + // PRODUCER-CONSUMER-NEXT: affine.load %{{.*}}[0] : memref<1xf32> + // PRODUCER-CONSUMER-NEXT: arith.addf + // PRODUCER-CONSUMER-NEXT: affine.store + // PRODUCER-CONSUMER-NEXT: } + return +} + +// PRODUCER-CONSUMER-LABEL: func @same_memref_load_multiple_stores +func.func @same_memref_load_multiple_stores(%producer : memref<32xf32>, %producer_2 : memref<32xf32>, %consumer: memref<16xf32>){ + %cst = arith.constant 2.000000e+00 : f32 + // Source isn't removed. 
+ // PRODUCER-CONSUMER: affine.for %{{.*}} = 0 to 32 + affine.for %arg3 = 0 to 32 { + %0 = affine.load %producer[%arg3] : memref<32xf32> + %2 = arith.mulf %0, %cst : f32 + affine.store %2, %producer[%arg3] : memref<32xf32> + affine.store %2, %producer_2[%arg3] : memref<32xf32> + } + affine.for %arg3 = 0 to 16 { + %0 = affine.load %producer[%arg3] : memref<32xf32> + %1 = affine.load %producer_2[%arg3] : memref<32xf32> + %2 = arith.addf %0, %1 : f32 + affine.store %2, %consumer[%arg3] : memref<16xf32> + } + // Fused nest. + // PRODUCER-CONSUMER: affine.for %{{.*}} = 0 to 16 + // PRODUCER-CONSUMER-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<32xf32> + // PRODUCER-CONSUMER-NEXT: arith.mulf + // PRODUCER-CONSUMER-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32> + // PRODUCER-CONSUMER-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32> + // PRODUCER-CONSUMER-NEXT: affine.load %{{.*}}[0] : memref<1xf32> + // PRODUCER-CONSUMER-NEXT: affine.load %{{.*}}[0] : memref<1xf32> + // PRODUCER-CONSUMER-NEXT: arith.addf + // PRODUCER-CONSUMER-NEXT: affine.store + // PRODUCER-CONSUMER-NEXT: } + return +} From 9d5edc9a0dd35049017aad2a9d3f4a4a2746fec9 Mon Sep 17 00:00:00 2001 From: Augusto Noronha Date: Fri, 7 Feb 2025 19:12:35 -0800 Subject: [PATCH 040/293] [lldb][NFC] Replace GetLocalBufferSize() with GetLocalBuffer() (#126333) --- lldb/include/lldb/ValueObject/ValueObject.h | 17 +++++++++-------- lldb/source/ValueObject/ValueObject.cpp | 12 ++++++------ .../ValueObject/ValueObjectDynamicValue.cpp | 2 +- .../DynamicValueObjectLocalBuffer.cpp | 5 +---- 4 files changed, 17 insertions(+), 19 deletions(-) diff --git a/lldb/include/lldb/ValueObject/ValueObject.h b/lldb/include/lldb/ValueObject/ValueObject.h index c8d5c2723106d..a0f53d20327cd 100644 --- a/lldb/include/lldb/ValueObject/ValueObject.h +++ b/lldb/include/lldb/ValueObject/ValueObject.h @@ -865,17 +865,18 @@ class ValueObject { virtual void SetLanguageFlags(uint64_t flags) { m_language_flags = flags; } - /// Returns the 
size of the local buffer if it's available. + /// Returns the local buffer that this ValueObject points to if it's + /// available. /// \return - /// The size of the local buffer if this value object's value points to a - /// host address, and if that size can be determined. Otherwise, returns - /// LLDB_INVALID_ADDRESS. + /// The local buffer if this value object's value points to a + /// host address, and if that buffer can be determined. Otherwise, returns + /// an empty ArrayRef. /// /// TODO: Because a ValueObject's Value can point to any arbitrary memory - /// location, it is possible that the size of the local buffer can't be - /// determined at all. See the comment in Value::m_value for a more thorough - /// explanation of why that is. - uint64_t GetLocalBufferSize(); + /// location, it is possible that we can't find what what buffer we're + /// pointing to, and thus also can't know its size. See the comment in + /// Value::m_value for a more thorough explanation of why that is. + llvm::ArrayRef GetLocalBuffer() const; protected: typedef ClusterManager ValueObjectManager; diff --git a/lldb/source/ValueObject/ValueObject.cpp b/lldb/source/ValueObject/ValueObject.cpp index 551d882a48d40..9d98f62c0379b 100644 --- a/lldb/source/ValueObject/ValueObject.cpp +++ b/lldb/source/ValueObject/ValueObject.cpp @@ -849,20 +849,20 @@ bool ValueObject::SetData(DataExtractor &data, Status &error) { return true; } -uint64_t ValueObject::GetLocalBufferSize() { +llvm::ArrayRef ValueObject::GetLocalBuffer() const { if (m_value.GetValueType() != Value::ValueType::HostAddress) - return LLDB_INVALID_ADDRESS; + return {}; auto start = m_value.GetScalar().ULongLong(LLDB_INVALID_ADDRESS); if (start == LLDB_INVALID_ADDRESS) - return LLDB_INVALID_ADDRESS; + return {}; // Does our pointer point to this value object's m_data buffer? if ((uint64_t)m_data.GetDataStart() == start) - return m_data.GetByteSize(); + return m_data.GetData(); // Does our pointer point to the value's buffer? 
if ((uint64_t)m_value.GetBuffer().GetBytes() == start) - return m_value.GetBuffer().GetByteSize(); + return m_value.GetBuffer().GetData(); // Our pointer points to something else. We can't know what the size is. - return LLDB_INVALID_ADDRESS; + return {}; } static bool CopyStringDataToBufferSP(const StreamString &source, diff --git a/lldb/source/ValueObject/ValueObjectDynamicValue.cpp b/lldb/source/ValueObject/ValueObjectDynamicValue.cpp index dddb0f0700b38..ecd663af68c2d 100644 --- a/lldb/source/ValueObject/ValueObjectDynamicValue.cpp +++ b/lldb/source/ValueObject/ValueObjectDynamicValue.cpp @@ -241,7 +241,7 @@ bool ValueObjectDynamicValue::UpdateValue() { SetValueDidChange(true); // If we found a host address, and the dynamic type fits in the local buffer - // that was found, point to thar buffer. Later on this function will copy + // that was found, point to that buffer. Later on this function will copy // the buffer over. if (value_type == Value::ValueType::HostAddress && !local_buffer.empty()) { auto *exe_scope = exe_ctx.GetBestExecutionContextScope(); diff --git a/lldb/unittests/ValueObject/DynamicValueObjectLocalBuffer.cpp b/lldb/unittests/ValueObject/DynamicValueObjectLocalBuffer.cpp index e3cf0f8a87bd2..417708dd2dc22 100644 --- a/lldb/unittests/ValueObject/DynamicValueObjectLocalBuffer.cpp +++ b/lldb/unittests/ValueObject/DynamicValueObjectLocalBuffer.cpp @@ -66,11 +66,8 @@ struct MockLanguageRuntime : public LanguageRuntime { *ast, "TypeWitInt", ast->GetBasicType(lldb::BasicType::eBasicTypeInt), "theIntField", LanguageType::eLanguageTypeC_plus_plus); class_type_or_name.SetCompilerType(int_type); - local_buffer = {(uint8_t *)in_value.GetValue().GetScalar().ULongLong( - LLDB_INVALID_ADDRESS), - static_cast(in_value.GetLocalBufferSize())}; + local_buffer = in_value.GetLocalBuffer(); value_type = Value::ValueType::HostAddress; - return true; } From de12bf508970ef9c0612c3950410530c4b822e6e Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 7 Feb 2025 
21:37:21 -0800 Subject: [PATCH 041/293] [RISCV] Refactor tablegen classes to push common values down to VPseudoBinaryM. NFC (#126339) Move VPseudoBinaryM ajacent to its only users. --- .../Target/RISCV/RISCVInstrInfoVPseudos.td | 90 +++++++++---------- 1 file changed, 40 insertions(+), 50 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 268bfe70673a2..fe85d4b074c87 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -2136,23 +2136,6 @@ multiclass VPseudoBinaryRoundingMode TargetConstraintType = 1, - bit Commutable = 0> { - let VLMul = MInfo.value, isCommutable = Commutable in { - def "_" # MInfo.MX : VPseudoBinaryNoMask; - let ForceTailAgnostic = true in - def "_" # MInfo.MX # "_MASK" : VPseudoBinaryMOutMask, - RISCVMaskedPseudo; - } -} - multiclass VPseudoBinaryEmul TargetConstraintType = 1, - bit Commutable = 0> { - defm _VV : VPseudoBinaryM { + let VLMul = m.value, isCommutable = Commutable in { + def "_" # m.MX : + VPseudoBinaryNoMask; + let ForceTailAgnostic = true in + def "_" # m.MX # "_MASK" : + VPseudoBinaryMOutMask; + TargetConstraintType = 2>, + RISCVMaskedPseudo; + } } -multiclass VPseudoBinaryM_VX TargetConstraintType = 1> { - defm "_VX" : - VPseudoBinaryM; +multiclass VPseudoBinaryM_VV { + defm _VV : VPseudoBinaryM; } -multiclass VPseudoBinaryM_VF TargetConstraintType = 1> { - defm "_V" # f.FX : - VPseudoBinaryM; +multiclass VPseudoBinaryM_VX { + defm _VX : VPseudoBinaryM; } -multiclass VPseudoBinaryM_VI TargetConstraintType = 1> { - defm _VI : VPseudoBinaryM; +multiclass VPseudoBinaryM_VF { + defm "_V" # f.FX : VPseudoBinaryM; +} + +multiclass VPseudoBinaryM_VI { + defm _VI : VPseudoBinaryM; } multiclass VPseudoVGTR_VV_VX_VI { @@ -3397,11 +3387,11 @@ multiclass VPseudoVWMAC_VV_VF_BF_RM { multiclass VPseudoVCMPM_VV_VX_VI { foreach m = MxList in { defvar mx = m.MX; - defm "" : VPseudoBinaryM_VV, + defm "" : 
VPseudoBinaryM_VV, SchedBinary<"WriteVICmpV", "ReadVICmpV", "ReadVICmpV", mx>; - defm "" : VPseudoBinaryM_VX, + defm "" : VPseudoBinaryM_VX, SchedBinary<"WriteVICmpX", "ReadVICmpV", "ReadVICmpX", mx>; - defm "" : VPseudoBinaryM_VI, + defm "" : VPseudoBinaryM_VI, SchedUnary<"WriteVICmpI", "ReadVICmpV", mx>; } } @@ -3409,22 +3399,32 @@ multiclass VPseudoVCMPM_VV_VX_VI { multiclass VPseudoVCMPM_VV_VX { foreach m = MxList in { defvar mx = m.MX; - defm "" : VPseudoBinaryM_VV, + defm "" : VPseudoBinaryM_VV, SchedBinary<"WriteVICmpV", "ReadVICmpV", "ReadVICmpV", mx>; - defm "" : VPseudoBinaryM_VX, + defm "" : VPseudoBinaryM_VX, + SchedBinary<"WriteVICmpX", "ReadVICmpV", "ReadVICmpX", mx>; + } +} + +multiclass VPseudoVCMPM_VX_VI { + foreach m = MxList in { + defvar mx = m.MX; + defm "" : VPseudoBinaryM_VX, SchedBinary<"WriteVICmpX", "ReadVICmpV", "ReadVICmpX", mx>; + defm "" : VPseudoBinaryM_VI, + SchedUnary<"WriteVICmpI", "ReadVICmpV", mx>; } } multiclass VPseudoVCMPM_VV_VF { foreach m = MxListF in { - defm "" : VPseudoBinaryM_VV, + defm "" : VPseudoBinaryM_VV, SchedBinary<"WriteVFCmpV", "ReadVFCmpV", "ReadVFCmpV", m.MX>; } foreach f = FPList in { foreach m = f.MxList in { - defm "" : VPseudoBinaryM_VF, + defm "" : VPseudoBinaryM_VF, SchedBinary<"WriteVFCmpF", "ReadVFCmpV", "ReadVFCmpF", m.MX>; } } @@ -3433,22 +3433,12 @@ multiclass VPseudoVCMPM_VV_VF { multiclass VPseudoVCMPM_VF { foreach f = FPList in { foreach m = f.MxList in { - defm "" : VPseudoBinaryM_VF, + defm "" : VPseudoBinaryM_VF, SchedBinary<"WriteVFCmpF", "ReadVFCmpV", "ReadVFCmpF", m.MX>; } } } -multiclass VPseudoVCMPM_VX_VI { - foreach m = MxList in { - defvar mx = m.MX; - defm "" : VPseudoBinaryM_VX, - SchedBinary<"WriteVICmpX", "ReadVICmpV", "ReadVICmpX", mx>; - defm "" : VPseudoBinaryM_VI, - SchedUnary<"WriteVICmpI", "ReadVICmpV", mx>; - } -} - multiclass VPseudoVRED_VS { foreach m = MxList in { defvar mx = m.MX; From e0a21e23a7aa6acf3e07b866c3c599db5eb4b67f Mon Sep 17 00:00:00 2001 From: Gedare Bloom 
Date: Fri, 7 Feb 2025 23:10:35 -0700 Subject: [PATCH 042/293] [clang-format] Add BinPackLongBracedList style option (#112482) The use of Cpp11BracedListStyle with BinPackArguments=False avoids bin packing until reaching a hard-coded limit of 20 items. This is an arbitrary choice. Introduce a new style option to allow disabling this limit. --- clang/docs/ClangFormatStyleOptions.rst | 18 +++++++++ clang/docs/ReleaseNotes.rst | 2 + clang/include/clang/Format/Format.h | 17 ++++++++ clang/lib/Format/Format.cpp | 2 + clang/lib/Format/FormatToken.cpp | 2 +- clang/unittests/Format/ConfigParseTest.cpp | 1 + clang/unittests/Format/FormatTest.cpp | 45 ++++++++++++++++++++++ 7 files changed, 86 insertions(+), 1 deletion(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index ce38a3a9ba1f7..bf6dd9e13915f 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -2182,6 +2182,24 @@ the configuration (without a prefix: ``Auto``). aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa); } +.. _BinPackLongBracedList: + +**BinPackLongBracedList** (``Boolean``) :versionbadge:`clang-format 21` :ref:`¶ ` + If ``BinPackLongBracedList`` is ``true`` it overrides + ``BinPackArguments`` if there are 20 or more items in a braced + initializer list. + + .. code-block:: c++ + + BinPackLongBracedList: false vs. BinPackLongBracedList: true + vector x{ vector x{1, 2, ..., + 20, 21}; + 1, + 2, + ..., + 20, + 21}; + .. _BinPackParameters: **BinPackParameters** (``BinPackParametersStyle``) :versionbadge:`clang-format 3.7` :ref:`¶ ` diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 92f63c1503089..03997395f56d8 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -237,6 +237,8 @@ clang-format ------------ - Adds ``BreakBeforeTemplateCloser`` option. 
+- Adds ``BinPackLongBracedList`` option to override bin packing options in + long (20 item or more) braced list initializer lists. libclang -------- diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index fbc9291ae950d..16956b4e0fbd4 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -1212,6 +1212,22 @@ struct FormatStyle { /// \version 3.7 bool BinPackArguments; + /// If ``BinPackLongBracedList`` is ``true`` it overrides + /// ``BinPackArguments`` if there are 20 or more items in a braced + /// initializer list. + /// \code + /// BinPackLongBracedList: false vs. BinPackLongBracedList: true + /// vector x{ vector x{1, 2, ..., + /// 20, 21}; + /// 1, + /// 2, + /// ..., + /// 20, + /// 21}; + /// \endcode + /// \version 21 + bool BinPackLongBracedList; + /// Different way to try to fit all parameters on a line. enum BinPackParametersStyle : int8_t { /// Bin-pack parameters. @@ -5266,6 +5282,7 @@ struct FormatStyle { R.AlwaysBreakBeforeMultilineStrings && AttributeMacros == R.AttributeMacros && BinPackArguments == R.BinPackArguments && + BinPackLongBracedList == R.BinPackLongBracedList && BinPackParameters == R.BinPackParameters && BitFieldColonSpacing == R.BitFieldColonSpacing && BracedInitializerIndentWidth == R.BracedInitializerIndentWidth && diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 387daad934f67..0898b69528ebc 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -995,6 +995,7 @@ template <> struct MappingTraits { Style.AlwaysBreakBeforeMultilineStrings); IO.mapOptional("AttributeMacros", Style.AttributeMacros); IO.mapOptional("BinPackArguments", Style.BinPackArguments); + IO.mapOptional("BinPackLongBracedList", Style.BinPackLongBracedList); IO.mapOptional("BinPackParameters", Style.BinPackParameters); IO.mapOptional("BitFieldColonSpacing", Style.BitFieldColonSpacing); IO.mapOptional("BracedInitializerIndentWidth", @@ 
-1507,6 +1508,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.AlwaysBreakBeforeMultilineStrings = false; LLVMStyle.AttributeMacros.push_back("__capability"); LLVMStyle.BinPackArguments = true; + LLVMStyle.BinPackLongBracedList = true; LLVMStyle.BinPackParameters = FormatStyle::BPPS_BinPack; LLVMStyle.BitFieldColonSpacing = FormatStyle::BFCS_Both; LLVMStyle.BracedInitializerIndentWidth = std::nullopt; diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp index 99bce1f5f0985..fb040a0043602 100644 --- a/clang/lib/Format/FormatToken.cpp +++ b/clang/lib/Format/FormatToken.cpp @@ -175,7 +175,7 @@ void CommaSeparatedList::precomputeFormattingInfos(const FormatToken *Token) { // have many items (20 or more) or we allow bin-packing of function call // arguments. if (Style.Cpp11BracedListStyle && !Style.BinPackArguments && - Commas.size() < 19) { + (Commas.size() < 19 || !Style.BinPackLongBracedList)) { return; } diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index 0cb2a1288bfd7..9cd262960b724 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -168,6 +168,7 @@ TEST(ConfigParseTest, ParsesConfigurationBools) { CHECK_PARSE_BOOL(AllowShortLoopsOnASingleLine); CHECK_PARSE_BOOL(AllowShortNamespacesOnASingleLine); CHECK_PARSE_BOOL(BinPackArguments); + CHECK_PARSE_BOOL(BinPackLongBracedList); CHECK_PARSE_BOOL(BreakAdjacentStringLiterals); CHECK_PARSE_BOOL(BreakAfterJavaFieldAnnotations); CHECK_PARSE_BOOL(BreakBeforeTemplateCloser); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index a9fddc3275aed..9b9ce35f83bc5 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -14420,6 +14420,51 @@ TEST_F(FormatTest, LayoutCxx11BraceInitializers) { "};", NoBinPacking); + NoBinPacking.BinPackLongBracedList = false; + verifyFormat("const Aaaaaa 
aaaaa = {aaaaa,\n" + " bbbbb,\n" + " ccccc,\n" + " ddddd,\n" + " eeeee,\n" + " ffffff,\n" + " ggggg,\n" + " hhhhhh,\n" + " iiiiii,\n" + " jjjjjj,\n" + " kkkkkk,\n" + " aaaaa,\n" + " bbbbb,\n" + " ccccc,\n" + " ddddd,\n" + " eeeee,\n" + " ffffff,\n" + " ggggg,\n" + " hhhhhh,\n" + " iiiiii};", + NoBinPacking); + verifyFormat("const Aaaaaa aaaaa = {\n" + " aaaaa,\n" + " bbbbb,\n" + " ccccc,\n" + " ddddd,\n" + " eeeee,\n" + " ffffff,\n" + " ggggg,\n" + " hhhhhh,\n" + " iiiiii,\n" + " jjjjjj,\n" + " kkkkkk,\n" + " aaaaa,\n" + " bbbbb,\n" + " ccccc,\n" + " ddddd,\n" + " eeeee,\n" + " ffffff,\n" + " ggggg,\n" + " hhhhhh,\n" + "};", + NoBinPacking); + NoBinPacking.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; verifyFormat("static uint8 CddDp83848Reg[] = {\n" " CDDDP83848_BMCR_REGISTER,\n" From 6f241e36831927e3aea113cfc017c34fdeda340a Mon Sep 17 00:00:00 2001 From: Yanzuo Liu Date: Sat, 8 Feb 2025 14:31:47 +0800 Subject: [PATCH 043/293] [Clang][Sema] Fix wrong initialization kind when handling initializing structured bindings from an array with direct-list-initialization (#124793) In 377257f063c, elements of structured bindings are copy-initialized. They should be direct-initialized because the form of the initializer of the whole structured bindings is a direct-list-initialization. > [dcl.struct.bind]/1: > ... and each element is copy-initialized or direct-initialized from the corresponding element of the assignment-expression as specified by the form of the initializer. ... 
For example, ```cpp int arr[2]{}; // elements of `[a, b]` should be direct-initialized auto [a, b]{arr}; ``` --- clang/docs/ReleaseNotes.rst | 2 ++ clang/lib/Sema/SemaInit.cpp | 8 +++-- clang/test/SemaCXX/cxx1z-decomposition.cpp | 42 ++++++++++------------ 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 03997395f56d8..50d3bbbc97e91 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -151,6 +151,8 @@ Bug Fixes to C++ Support ^^^^^^^^^^^^^^^^^^^^^^^^ - Clang is now better at keeping track of friend function template instance contexts. (#GH55509) +- The initialization kind of elements of structured bindings + direct-list-initialized from an array is corrected to direct-initialization. Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index f206cd57eca89..308222a79d920 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -4862,9 +4862,13 @@ static void TryListInitialization(Sema &S, assert( S.Context.hasSameUnqualifiedType(SubInit[0]->getType(), DestType) && "Deduced to other type?"); + assert(Kind.getKind() == clang::InitializationKind::IK_DirectList && + "List-initialize structured bindings but not " + "direct-list-initialization?"); TryArrayCopy(S, - InitializationKind::CreateCopy(Kind.getLocation(), - InitList->getLBraceLoc()), + InitializationKind::CreateDirect(Kind.getLocation(), + InitList->getLBraceLoc(), + InitList->getRBraceLoc()), Entity, SubInit[0], DestType, Sequence, TreatUnavailableAsInvalid); if (Sequence) diff --git a/clang/test/SemaCXX/cxx1z-decomposition.cpp b/clang/test/SemaCXX/cxx1z-decomposition.cpp index a8914fe4e9cd8..95c64bc3b8bff 100644 --- a/clang/test/SemaCXX/cxx1z-decomposition.cpp +++ b/clang/test/SemaCXX/cxx1z-decomposition.cpp @@ -200,38 +200,32 @@ namespace lambdas { namespace by_value_array_copy { struct explicit_copy { - 
explicit_copy() = default; // expected-note 2{{candidate constructor not viable: requires 0 arguments, but 1 was provided}} - explicit explicit_copy(const explicit_copy&) = default; // expected-note 2{{explicit constructor is not a candidate}} + explicit_copy() = default; // expected-note {{candidate constructor not viable: requires 0 arguments, but 1 was provided}} + explicit explicit_copy(const explicit_copy&) = default; // expected-note {{explicit constructor is not a candidate}} }; - constexpr int direct_initialization_for_elements() { - explicit_copy ec_arr[2]; - auto [a1, b1](ec_arr); + constexpr int simple_array_elements() { + int arr[2]{1, 2}; - int arr[3]{1, 2, 3}; - auto [a2, b2, c2](arr); - arr[0]--; - return a2 + b2 + c2 + arr[0]; - } - static_assert(direct_initialization_for_elements() == 6); + auto [a1, a2] = arr; + auto [b1, b2](arr); + auto [c1, c2]{arr}; // GH31813 - constexpr int copy_initialization_for_elements() { - int arr[2]{4, 5}; - auto [a1, b1] = arr; - auto [a2, b2]{arr}; // GH31813 arr[0] = 0; - return a1 + b1 + a2 + b2 + arr[0]; + return arr[0] + a1 + a2 + b1 + b2 + c1 + c2; } - static_assert(copy_initialization_for_elements() == 18); + static_assert(simple_array_elements() == 9); + + void explicit_copy_ctor_array_elements() { + explicit_copy ec_arr[1]; - void copy_initialization_for_elements_with_explicit_copy_ctor() { - explicit_copy ec_arr[2]; - auto [a1, b1] = ec_arr; // expected-error {{no matching constructor for initialization of 'explicit_copy[2]'}} - auto [a2, b2]{ec_arr}; // expected-error {{no matching constructor for initialization of 'explicit_copy[2]'}} + auto [a] = ec_arr; // expected-error {{no matching constructor for initialization of 'explicit_copy[1]'}} + auto [b](ec_arr); + auto [c]{ec_arr}; // Test prvalue - using T = explicit_copy[2]; - auto [a3, b3] = T{}; - auto [a4, b4]{T{}}; + using T = explicit_copy[1]; + auto [d] = T{}; } + } // namespace by_value_array_copy From 1c497c4837e82e23589b29e3ce0aedd3f461018b Mon 
Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 8 Feb 2025 00:48:01 -0800 Subject: [PATCH 044/293] [CodeGen] Avoid repeated hash lookups (NFC) (#126343) --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 460749a739c76..4d9d7128f73a8 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -1065,8 +1065,9 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { assert(!MRI->isReserved(DestReg)); // Update matching debug values, if any. - SmallVector MaybeDeadDbgUsers( - CopyDbgUsers[MaybeDead].begin(), CopyDbgUsers[MaybeDead].end()); + const auto &DbgUsers = CopyDbgUsers[MaybeDead]; + SmallVector MaybeDeadDbgUsers(DbgUsers.begin(), + DbgUsers.end()); MRI->updateDbgUsersToReg(DestReg.asMCReg(), SrcReg.asMCReg(), MaybeDeadDbgUsers); @@ -1238,8 +1239,9 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( isCopyInstr(*Copy, *TII, UseCopyInstr); Register Src = CopyOperands->Source->getReg(); Register Def = CopyOperands->Destination->getReg(); - SmallVector MaybeDeadDbgUsers(CopyDbgUsers[Copy].begin(), - CopyDbgUsers[Copy].end()); + const auto &DbgUsers = CopyDbgUsers[Copy]; + SmallVector MaybeDeadDbgUsers(DbgUsers.begin(), + DbgUsers.end()); MRI->updateDbgUsersToReg(Src.asMCReg(), Def.asMCReg(), MaybeDeadDbgUsers); Copy->eraseFromParent(); From dbe812220c3100ece253feb72d65172780ef723b Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 8 Feb 2025 00:48:30 -0800 Subject: [PATCH 045/293] [TableGen] Avoid repeated hash lookups (NFC) (#126344) --- llvm/utils/TableGen/X86InstrMappingEmitter.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp index 1ee79aa27fa98..df43f39e0e9be 100644 --- 
a/llvm/utils/TableGen/X86InstrMappingEmitter.cpp +++ b/llvm/utils/TableGen/X86InstrMappingEmitter.cpp @@ -228,8 +228,9 @@ void X86InstrMappingEmitter::emitCompressEVEXTable( // For each pre-compression instruction look for a match in the // appropriate vector (instructions with the same opcode) using function // object IsMatch. - auto Match = llvm::find_if(CompressedInsts[Opcode], IsMatch(Inst)); - if (Match != CompressedInsts[Opcode].end()) + const auto &Insts = CompressedInsts[Opcode]; + auto Match = llvm::find_if(Insts, IsMatch(Inst)); + if (Match != Insts.end()) NewInst = *Match; } From 5901bda5a0ed31e024abc8a7af52b272400daa08 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 8 Feb 2025 00:48:51 -0800 Subject: [PATCH 046/293] [Vectorize] Avoid repeated hash lookups (NFC) (#126345) --- llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 0b40b733ffe7e..04b392829f0d7 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -1335,8 +1335,9 @@ void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const { const auto &Key = EC.first; EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key), std::get<3>(Key)}; - RedKeyToUOMap[RedKey].insert(std::get<0>(Key)); - if (RedKeyToUOMap[RedKey].size() > 1) + auto &UOMap = RedKeyToUOMap[RedKey]; + UOMap.insert(std::get<0>(Key)); + if (UOMap.size() > 1) FoundPotentiallyOptimizableEC = true; } if (!FoundPotentiallyOptimizableEC) From 95922d83341f3476bdc2eccd524a02d9a4ab80da Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 8 Feb 2025 00:49:42 -0800 Subject: [PATCH 047/293] [dsymutil] Avoid repeated hash lookups (NFC) (#126190) (#126346) --- llvm/tools/dsymutil/BinaryHolder.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff 
--git a/llvm/tools/dsymutil/BinaryHolder.cpp b/llvm/tools/dsymutil/BinaryHolder.cpp index 5daaa6755b295..7588a33eb46b2 100644 --- a/llvm/tools/dsymutil/BinaryHolder.cpp +++ b/llvm/tools/dsymutil/BinaryHolder.cpp @@ -176,8 +176,8 @@ BinaryHolder::ArchiveEntry::getObjectEntry(StringRef Filename, // Try the cache first. std::lock_guard Lock(MemberCacheMutex); - if (MemberCache.count(Key)) - return *MemberCache[Key]; + if (auto It = MemberCache.find(Key); It != MemberCache.end()) + return *It->second; // Create a new ObjectEntry, but don't add it to the cache yet. Loading of // the archive members might fail and we don't want to lock the whole archive @@ -228,8 +228,7 @@ BinaryHolder::ArchiveEntry::getObjectEntry(StringRef Filename, if (OE->Objects.empty()) return errorCodeToError(errc::no_such_file_or_directory); - MemberCache[Key] = std::move(OE); - return *MemberCache[Key]; + return *(MemberCache[Key] = std::move(OE)); } Expected From 027aa70ea44502280779c3887c72886326785c6b Mon Sep 17 00:00:00 2001 From: Thomas Preud'homme Date: Sat, 8 Feb 2025 09:23:32 +0000 Subject: [PATCH 048/293] [TOSA] Fix negate maxValue computation (#126295) getInput1Zp() returns an unsigned value which means in case of negative zero point value the max intermediate value computation currently goes wrong. Use getInput1ZpAttr() instead which returns an APInt and allows easy sign extension to int64_t. 
--- mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp | 10 ++++++---- .../Conversion/TosaToLinalg/tosa-to-linalg.mlir | 15 ++++++++++++++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index e4f055ea2f5c4..0246d9019368a 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -146,11 +146,13 @@ static Value createLinalgBodyCalculationForElementwiseOp( return rewriter.create(loc, resultTypes, args); if (isa(elementTy)) { - auto inputZpAttr = cast(op).getInput1Zp(); - auto outputZpAttr = cast(op).getOutputZp(); + auto inputZpAttr = cast(op).getInput1ZpAttr(); + auto outputZpAttr = cast(op).getOutputZpAttr(); - const int64_t inZp = inputZpAttr ? *inputZpAttr : 0; - const int64_t outZp = outputZpAttr ? *outputZpAttr : 0; + const int64_t inZp = + inputZpAttr ? inputZpAttr.getValue().getSExtValue() : 0; + const int64_t outZp = + outputZpAttr ? 
outputZpAttr.getValue().getSExtValue() : 0; if (!inZp && !outZp) { auto constant = rewriter.create( diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir index 3031434e6d4ba..d8ba28a3ce887 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -911,12 +911,25 @@ func.func @test_negate_quantized(%arg0: tensor<1xi8>) -> () { // CHECK: linalg.yield [[TRUNC]] %2 = tosa.negate %arg0 {input1_zp = 32640 : i32, output_zp = 0 : i32} : (tensor<1xi8>) -> tensor<1xi8> + // CHECK: linalg.generic + // CHECK: ^bb0(%[[BBARG0:.+]]: i8, + // CHECK: [[C_128:%.+]] = arith.constant -128 + // CHECK: [[EXT:%.+]] = arith.extsi %[[BBARG0]] : i8 to i16 + // CHECK: [[SUB:%.+]] = arith.subi [[C_128]], [[EXT]] + // CHECK: [[MIN:%.+]] = arith.constant -128 + // CHECK: [[MAX:%.+]] = arith.constant 127 + // CHECK: [[LBOUND:%.+]] = arith.maxsi [[MIN]], [[SUB]] + // CHECK: [[UBOUND:%.+]] = arith.minsi [[MAX]], [[LBOUND]] + // CHECK: [[TRUNC:%.+]] = arith.trunci [[UBOUND]] + // CHECK: linalg.yield [[TRUNC]] + %3 = tosa.negate %arg0 {input1_zp = -128 : i32, output_zp = 0 : i32} : (tensor<1xi8>) -> tensor<1xi8> + // CHECK: linalg.generic // CHECK: ^bb0(%[[BBARG0:.+]]: i8, // CHECK: [[ZERO:%.+]] = arith.constant 0 // CHECK: [[SUB:%.+]] = arith.subi [[ZERO]], // CHECK: linalg.yield [[SUB]] - %3 = tosa.negate %arg0 {quantization_info = #tosa.unary_quant} : (tensor<1xi8>) -> tensor<1xi8> + %4 = tosa.negate %arg0 {quantization_info = #tosa.unary_quant} : (tensor<1xi8>) -> tensor<1xi8> return } From 564b9b7f4db05b5ce3558041b164f21dfe051a91 Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Sat, 8 Feb 2025 15:36:48 +0530 Subject: [PATCH 049/293] Revert "CodeGen][NewPM] Port MachineScheduler to NPM. (#125703)" (#126268) This reverts commit 5aa4979c47255770cac7b557f3e4a980d0131d69 while I investigate what's causing the compile-time regression. 
--- llvm/include/llvm/CodeGen/MachineScheduler.h | 18 -- llvm/include/llvm/InitializePasses.h | 4 +- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 5 +- .../llvm/Passes/MachinePassRegistry.def | 4 +- llvm/lib/CodeGen/CodeGen.cpp | 4 +- llvm/lib/CodeGen/MachineScheduler.cpp | 287 +++++------------- llvm/lib/CodeGen/RegAllocBasic.cpp | 2 +- llvm/lib/CodeGen/RegAllocGreedy.cpp | 2 +- llvm/lib/Passes/PassBuilder.cpp | 1 - .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 - .../test/CodeGen/AArch64/a55-fuse-address.mir | 1 - .../CodeGen/AArch64/ampere1-sched-add.mir | 1 - .../CodeGen/AArch64/cluster-frame-index.mir | 1 - .../CodeGen/AArch64/dump-reserved-cycles.mir | 6 - .../CodeGen/AArch64/dump-schedule-trace.mir | 17 -- .../AArch64/force-enable-intervals.mir | 10 - .../CodeGen/AArch64/machine-scheduler.mir | 1 - .../macro-fusion-addsub-2reg-const1.mir | 2 - .../CodeGen/AArch64/macro-fusion-last.mir | 2 - .../AArch64/misched-branch-targets.mir | 3 - llvm/test/CodeGen/AArch64/misched-bundle.mir | 1 - .../misched-detail-resource-booking-01.mir | 8 - .../misched-detail-resource-booking-02.mir | 7 - .../AArch64/misched-fusion-arith-logic.mir | 2 - .../CodeGen/AArch64/misched-fusion-cmp.mir | 1 - .../AArch64/misched-fusion-crypto-eor.mir | 3 - .../test/CodeGen/AArch64/misched-move-imm.mir | 1 - .../AArch64/misched-predicate-virtreg.mir | 1 - .../misched-sort-resource-in-trace.mir | 10 - .../CodeGen/AArch64/sched-postidxalias.mir | 1 - .../CodeGen/AArch64/sched-print-cycle.mir | 6 - .../CodeGen/AArch64/scheduledag-constreg.mir | 1 - llvm/test/CodeGen/AArch64/sve-aliasing.mir | 1 - .../AMDGPU/at-least-one-def-value-assert.mir | 2 - .../CodeGen/AMDGPU/cluster-flat-loads.mir | 1 - .../AMDGPU/dbg-value-ends-sched-region.mir | 1 - .../AMDGPU/debug-value-scheduler-crash.mir | 1 - .../AMDGPU/debug-value-scheduler-liveins.mir | 1 - .../CodeGen/AMDGPU/debug-value-scheduler.mir | 1 - .../CodeGen/AMDGPU/flat-load-clustering.mir | 1 - .../CodeGen/AMDGPU/high-RP-reschedule.mir | 6 +- 
...ne-scheduler-sink-trivial-remats-debug.mir | 1 - .../machine-scheduler-sink-trivial-remats.mir | 1 - .../AMDGPU/macro-fusion-cluster-vcc-uses.mir | 1 - ...ssert-dead-def-subreg-use-other-subreg.mir | 3 +- ...ched-assert-onlydbg-value-empty-region.mir | 1 - .../AMDGPU/sched-barrier-hang-weak-dep.mir | 1 - .../CodeGen/AMDGPU/sched-crash-dbg-value.mir | 1 - ...dleMoveUp-subreg-def-across-subreg-def.mir | 1 - .../AMDGPU/schedule-barrier-fpmode.mir | 2 - llvm/test/CodeGen/AMDGPU/schedule-barrier.mir | 1 - .../AMDGPU/sreg-xnull-regclass-bitwidth.mir | 1 - llvm/test/CodeGen/ARM/cortex-m7-wideops.mir | 1 - .../CodeGen/ARM/misched-branch-targets.mir | 2 - .../CodeGen/PowerPC/topdepthreduce-postra.mir | 1 - .../RISCV/misched-postra-direction.mir | 13 - 56 files changed, 95 insertions(+), 366 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h index e1f1a1efecc72..4762494e6ccb7 100644 --- a/llvm/include/llvm/CodeGen/MachineScheduler.h +++ b/llvm/include/llvm/CodeGen/MachineScheduler.h @@ -1385,24 +1385,6 @@ std::unique_ptr createCopyConstrainDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI); -class MachineSchedulerPass : public PassInfoMixin { - const TargetMachine *TM; - -public: - MachineSchedulerPass(const TargetMachine *TM) : TM(TM) {} - PreservedAnalyses run(MachineFunction &MF, - MachineFunctionAnalysisManager &MFAM); -}; - -class PostMachineSchedulerPass - : public PassInfoMixin { - const TargetMachine *TM; - -public: - PostMachineSchedulerPass(const TargetMachine *TM) : TM(TM) {} - PreservedAnalyses run(MachineFunction &MF, - MachineFunctionAnalysisManager &MFAM); -}; } // end namespace llvm #endif // LLVM_CODEGEN_MACHINESCHEDULER_H diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index b8df4d1ecab1d..6d74d7f24bf9a 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -209,7 +209,7 @@ void 
initializeMachinePipelinerPass(PassRegistry &); void initializeMachinePostDominatorTreeWrapperPassPass(PassRegistry &); void initializeMachineRegionInfoPassPass(PassRegistry &); void initializeMachineSanitizerBinaryMetadataPass(PassRegistry &); -void initializeMachineSchedulerLegacyPass(PassRegistry &); +void initializeMachineSchedulerPass(PassRegistry &); void initializeMachineSinkingPass(PassRegistry &); void initializeMachineTraceMetricsWrapperPassPass(PassRegistry &); void initializeMachineUniformityInfoPrinterPassPass(PassRegistry &); @@ -238,7 +238,7 @@ void initializePostDomPrinterWrapperPassPass(PassRegistry &); void initializePostDomViewerWrapperPassPass(PassRegistry &); void initializePostDominatorTreeWrapperPassPass(PassRegistry &); void initializePostInlineEntryExitInstrumenterPass(PassRegistry &); -void initializePostMachineSchedulerLegacyPass(PassRegistry &); +void initializePostMachineSchedulerPass(PassRegistry &); void initializePostRAHazardRecognizerPass(PassRegistry &); void initializePostRAMachineSinkingPass(PassRegistry &); void initializePostRASchedulerLegacyPass(PassRegistry &); diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 1458318ff021a..7f91dd7ebf49d 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -50,7 +50,6 @@ #include "llvm/CodeGen/MachineLICM.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachinePassManager.h" -#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/MachineVerifier.h" #include "llvm/CodeGen/OptimizePHIs.h" #include "llvm/CodeGen/PHIElimination.h" @@ -961,7 +960,7 @@ Error CodeGenPassBuilder::addMachinePasses( if (getOptLevel() != CodeGenOptLevel::None && !TM.targetSchedulesPostRAScheduling()) { if (Opt.MISchedPostRA) - addPass(PostMachineSchedulerPass(&TM)); + addPass(PostMachineSchedulerPass()); else addPass(PostRASchedulerPass(&TM)); } @@ -1145,7 +1144,7 
@@ void CodeGenPassBuilder::addOptimizedRegAlloc( addPass(RenameIndependentSubregsPass()); // PreRA instruction scheduling. - addPass(MachineSchedulerPass(&TM)); + addPass(MachineSchedulerPass()); if (derived().addRegAssignmentOptimized(addPass)) { // Allow targets to expand pseudo instructions depending on the choice of diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index e6b4a4b0a56ae..9f9922dfa5673 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -142,13 +142,11 @@ MACHINE_FUNCTION_PASS("finalize-isel", FinalizeISelPass()) MACHINE_FUNCTION_PASS("localstackalloc", LocalStackSlotAllocationPass()) MACHINE_FUNCTION_PASS("machine-cp", MachineCopyPropagationPass()) MACHINE_FUNCTION_PASS("machine-cse", MachineCSEPass()) -MACHINE_FUNCTION_PASS("machine-scheduler", MachineSchedulerPass(TM)) MACHINE_FUNCTION_PASS("machinelicm", MachineLICMPass()) MACHINE_FUNCTION_PASS("no-op-machine-function", NoOpMachineFunctionPass()) MACHINE_FUNCTION_PASS("opt-phis", OptimizePHIsPass()) MACHINE_FUNCTION_PASS("peephole-opt", PeepholeOptimizerPass()) MACHINE_FUNCTION_PASS("phi-node-elimination", PHIEliminationPass()) -MACHINE_FUNCTION_PASS("postmisched", PostMachineSchedulerPass(TM)) MACHINE_FUNCTION_PASS("post-RA-sched", PostRASchedulerPass(TM)) MACHINE_FUNCTION_PASS("print", PrintMIRPass()) MACHINE_FUNCTION_PASS("print", LiveDebugVariablesPrinterPass(errs())) @@ -245,11 +243,13 @@ DUMMY_MACHINE_FUNCTION_PASS("static-data-splitter", StaticDataSplitter) DUMMY_MACHINE_FUNCTION_PASS("machine-function-splitter", MachineFunctionSplitterPass) DUMMY_MACHINE_FUNCTION_PASS("machine-latecleanup", MachineLateInstrsCleanupPass) DUMMY_MACHINE_FUNCTION_PASS("machine-sanmd", MachineSanitizerBinaryMetadata) +DUMMY_MACHINE_FUNCTION_PASS("machine-scheduler", MachineSchedulerPass) DUMMY_MACHINE_FUNCTION_PASS("machine-sink", MachineSinkingPass) 
DUMMY_MACHINE_FUNCTION_PASS("machine-uniformity", MachineUniformityInfoWrapperPass) DUMMY_MACHINE_FUNCTION_PASS("machineinstr-printer", MachineFunctionPrinterPass) DUMMY_MACHINE_FUNCTION_PASS("mirfs-discriminators", MIRAddFSDiscriminatorsPass) DUMMY_MACHINE_FUNCTION_PASS("patchable-function", PatchableFunctionPass) +DUMMY_MACHINE_FUNCTION_PASS("postmisched", PostMachineSchedulerPass) DUMMY_MACHINE_FUNCTION_PASS("postra-machine-sink", PostRAMachineSinkingPass) DUMMY_MACHINE_FUNCTION_PASS("postrapseudos", ExpandPostRAPseudosPass) DUMMY_MACHINE_FUNCTION_PASS("print-machine-cycles", MachineCycleInfoPrinterPass) diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 35df2a479a545..d69a24f00871e 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -94,7 +94,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeModuloScheduleTestPass(Registry); initializeMachinePostDominatorTreeWrapperPassPass(Registry); initializeMachineRegionInfoPassPass(Registry); - initializeMachineSchedulerLegacyPass(Registry); + initializeMachineSchedulerPass(Registry); initializeMachineSinkingPass(Registry); initializeMachineUniformityAnalysisPassPass(Registry); initializeMachineUniformityInfoPrinterPassPass(Registry); @@ -105,7 +105,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializePHIEliminationPass(Registry); initializePatchableFunctionPass(Registry); initializePeepholeOptimizerLegacyPass(Registry); - initializePostMachineSchedulerLegacyPass(Registry); + initializePostMachineSchedulerPass(Registry); initializePostRAHazardRecognizerPass(Registry); initializePostRAMachineSinkingPass(Registry); initializePostRASchedulerLegacyPass(Registry); diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index df90077b15f33..3f72e8486c06e 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -216,85 +216,67 @@ 
MachineSchedContext::~MachineSchedContext() { namespace { -/// Base class for the machine scheduler classes. -class MachineSchedulerBase : public MachineSchedContext { -protected: - void scheduleRegions(ScheduleDAGInstrs &Scheduler, bool FixKillFlags); -}; - -/// Impl class for MachineScheduler. -class MachineSchedulerImpl : public MachineSchedulerBase { - MachineFunctionPass *P = nullptr; - MachineFunctionAnalysisManager *MFAM = nullptr; - -public: - MachineSchedulerImpl(MachineFunction &Func, MachineFunctionPass *P); - MachineSchedulerImpl(MachineFunction &Func, - MachineFunctionAnalysisManager &MFAM, - const TargetMachine *TargetM); - bool run(); - -protected: - ScheduleDAGInstrs *createMachineScheduler(); -}; - -/// Impl class for PostMachineScheduler. -class PostMachineSchedulerImpl : public MachineSchedulerBase { - MachineFunctionPass *P = nullptr; - MachineFunctionAnalysisManager *MFAM = nullptr; - +/// Base class for a machine scheduler class that can run at any point. +class MachineSchedulerBase : public MachineSchedContext, + public MachineFunctionPass { public: - PostMachineSchedulerImpl(MachineFunction &Func, MachineFunctionPass *P); - PostMachineSchedulerImpl(MachineFunction &Func, - MachineFunctionAnalysisManager &MFAM, - const TargetMachine *TargetM); - bool run(); + MachineSchedulerBase(char &ID) : MachineFunctionPass(ID) {} protected: - ScheduleDAGInstrs *createPostMachineScheduler(); + void scheduleRegions(ScheduleDAGInstrs &Scheduler, bool FixKillFlags); }; /// MachineScheduler runs after coalescing and before register allocation. 
-class MachineSchedulerLegacy : public MachineFunctionPass { +class MachineScheduler : public MachineSchedulerBase { public: - MachineSchedulerLegacy(); + MachineScheduler(); + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction&) override; static char ID; // Class identification, replacement for typeinfo + +protected: + ScheduleDAGInstrs *createMachineScheduler(); }; /// PostMachineScheduler runs after shortly before code emission. -class PostMachineSchedulerLegacy : public MachineFunctionPass { +class PostMachineScheduler : public MachineSchedulerBase { public: - PostMachineSchedulerLegacy(); + PostMachineScheduler(); + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction&) override; static char ID; // Class identification, replacement for typeinfo + +protected: + ScheduleDAGInstrs *createPostMachineScheduler(); }; } // end anonymous namespace -char MachineSchedulerLegacy::ID = 0; +char MachineScheduler::ID = 0; -char &llvm::MachineSchedulerID = MachineSchedulerLegacy::ID; +char &llvm::MachineSchedulerID = MachineScheduler::ID; -INITIALIZE_PASS_BEGIN(MachineSchedulerLegacy, DEBUG_TYPE, +INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE, "Machine Instruction Scheduler", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) -INITIALIZE_PASS_END(MachineSchedulerLegacy, DEBUG_TYPE, +INITIALIZE_PASS_END(MachineScheduler, DEBUG_TYPE, "Machine Instruction Scheduler", false, false) -MachineSchedulerLegacy::MachineSchedulerLegacy() : MachineFunctionPass(ID) { - initializeMachineSchedulerLegacyPass(*PassRegistry::getPassRegistry()); +MachineScheduler::MachineScheduler() : MachineSchedulerBase(ID) { + 
initializeMachineSchedulerPass(*PassRegistry::getPassRegistry()); } -void MachineSchedulerLegacy::getAnalysisUsage(AnalysisUsage &AU) const { +void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addRequired(); AU.addRequired(); @@ -307,24 +289,23 @@ void MachineSchedulerLegacy::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -char PostMachineSchedulerLegacy::ID = 0; +char PostMachineScheduler::ID = 0; -char &llvm::PostMachineSchedulerID = PostMachineSchedulerLegacy::ID; +char &llvm::PostMachineSchedulerID = PostMachineScheduler::ID; -INITIALIZE_PASS_BEGIN(PostMachineSchedulerLegacy, "postmisched", +INITIALIZE_PASS_BEGIN(PostMachineScheduler, "postmisched", "PostRA Machine Instruction Scheduler", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(PostMachineSchedulerLegacy, "postmisched", +INITIALIZE_PASS_END(PostMachineScheduler, "postmisched", "PostRA Machine Instruction Scheduler", false, false) -PostMachineSchedulerLegacy::PostMachineSchedulerLegacy() - : MachineFunctionPass(ID) { - initializePostMachineSchedulerLegacyPass(*PassRegistry::getPassRegistry()); +PostMachineScheduler::PostMachineScheduler() : MachineSchedulerBase(ID) { + initializePostMachineSchedulerPass(*PassRegistry::getPassRegistry()); } -void PostMachineSchedulerLegacy::getAnalysisUsage(AnalysisUsage &AU) const { +void PostMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addRequired(); AU.addRequired(); @@ -403,40 +384,18 @@ nextIfDebug(MachineBasicBlock::iterator I, .getNonConstIterator(); } -MachineSchedulerImpl::MachineSchedulerImpl(MachineFunction &Func, - MachineFunctionPass *P) - : P(P) { - MF = &Func; - MLI = &P->getAnalysis().getLI(); - MDT = &P->getAnalysis().getDomTree(); - TM = &P->getAnalysis().getTM(); - AA = 
&P->getAnalysis().getAAResults(); - LIS = &P->getAnalysis().getLIS(); -} - -MachineSchedulerImpl::MachineSchedulerImpl(MachineFunction &Func, - MachineFunctionAnalysisManager &MFAM, - const TargetMachine *TargetM) - : MFAM(&MFAM) { - MF = &Func; - TM = TargetM; - MLI = &MFAM.getResult(Func); - MDT = &MFAM.getResult(Func); - auto &FAM = MFAM.getResult(Func) - .getManager(); - AA = &FAM.getResult(Func.getFunction()); - LIS = &MFAM.getResult(Func); -} - /// Instantiate a ScheduleDAGInstrs that will be owned by the caller. -ScheduleDAGInstrs *MachineSchedulerImpl::createMachineScheduler() { +ScheduleDAGInstrs *MachineScheduler::createMachineScheduler() { // Select the scheduler, or set the default. MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt; if (Ctor != useDefaultMachineSched) return Ctor(this); + const TargetMachine &TM = + getAnalysis().getTM(); + // Get the default scheduler set by the target for this function. - ScheduleDAGInstrs *Scheduler = TM->createMachineScheduler(this); + ScheduleDAGInstrs *Scheduler = TM.createMachineScheduler(this); if (Scheduler) return Scheduler; @@ -444,60 +403,14 @@ ScheduleDAGInstrs *MachineSchedulerImpl::createMachineScheduler() { return createGenericSchedLive(this); } -bool MachineSchedulerImpl::run() { - if (VerifyScheduling) { - LLVM_DEBUG(LIS->dump()); - const char *MSchedBanner = "Before machine scheduling."; - if (P) - MF->verify(P, MSchedBanner, &errs()); - else - MF->verify(*MFAM, MSchedBanner, &errs()); - } - RegClassInfo->runOnMachineFunction(*MF); - - // Instantiate the selected scheduler for this target, function, and - // optimization level. 
- std::unique_ptr Scheduler(createMachineScheduler()); - scheduleRegions(*Scheduler, false); - - LLVM_DEBUG(LIS->dump()); - if (VerifyScheduling) { - const char *MSchedBanner = "After machine scheduling."; - if (P) - MF->verify(P, MSchedBanner, &errs()); - else - MF->verify(*MFAM, MSchedBanner, &errs()); - } - return true; -} - -PostMachineSchedulerImpl::PostMachineSchedulerImpl(MachineFunction &Func, - MachineFunctionPass *P) - : P(P) { - MF = &Func; - MLI = &P->getAnalysis().getLI(); - TM = &P->getAnalysis().getTM(); - AA = &P->getAnalysis().getAAResults(); -} - -PostMachineSchedulerImpl::PostMachineSchedulerImpl( - MachineFunction &Func, MachineFunctionAnalysisManager &MFAM, - const TargetMachine *TargetM) - : MFAM(&MFAM) { - MF = &Func; - TM = TargetM; - MLI = &MFAM.getResult(Func); - auto &FAM = MFAM.getResult(Func) - .getManager(); - AA = &FAM.getResult(Func.getFunction()); -} - /// Instantiate a ScheduleDAGInstrs for PostRA scheduling that will be owned by /// the caller. We don't have a command line option to override the postRA /// scheduler. The Target must configure it. -ScheduleDAGInstrs *PostMachineSchedulerImpl::createPostMachineScheduler() { +ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() { + const TargetMachine &TM = + getAnalysis().getTM(); // Get the postRA scheduler set by the target for this function. 
- ScheduleDAGInstrs *Scheduler = TM->createPostMachineScheduler(this); + ScheduleDAGInstrs *Scheduler = TM.createPostMachineScheduler(this); if (Scheduler) return Scheduler; @@ -505,30 +418,6 @@ ScheduleDAGInstrs *PostMachineSchedulerImpl::createPostMachineScheduler() { return createGenericSchedPostRA(this); } -bool PostMachineSchedulerImpl::run() { - if (VerifyScheduling) { - const char *PostMSchedBanner = "Before post machine scheduling."; - if (P) - MF->verify(P, PostMSchedBanner, &errs()); - else - MF->verify(*MFAM, PostMSchedBanner, &errs()); - } - - // Instantiate the selected scheduler for this target, function, and - // optimization level. - std::unique_ptr Scheduler(createPostMachineScheduler()); - scheduleRegions(*Scheduler, true); - - if (VerifyScheduling) { - const char *PostMSchedBanner = "After post machine scheduling."; - if (P) - MF->verify(P, PostMSchedBanner, &errs()); - else - MF->verify(*MFAM, PostMSchedBanner, &errs()); - } - return true; -} - /// Top-level MachineScheduler pass driver. /// /// Visit blocks in function order. Divide each block into scheduling regions @@ -545,84 +434,72 @@ bool PostMachineSchedulerImpl::run() { /// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler /// design would be to split blocks at scheduling boundaries, but LLVM has a /// general bias against block splitting purely for implementation simplicity. 
-bool MachineSchedulerLegacy::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(MF.getFunction())) +bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { + if (skipFunction(mf.getFunction())) return false; if (EnableMachineSched.getNumOccurrences()) { if (!EnableMachineSched) return false; - } else if (!MF.getSubtarget().enableMachineScheduler()) { + } else if (!mf.getSubtarget().enableMachineScheduler()) return false; - } - LLVM_DEBUG(dbgs() << "Before MISched:\n"; MF.print(dbgs())); + LLVM_DEBUG(dbgs() << "Before MISched:\n"; mf.print(dbgs())); - MachineSchedulerImpl Impl(MF, this); - return Impl.run(); -} + // Initialize the context of the pass. + MF = &mf; + MLI = &getAnalysis().getLI(); + MDT = &getAnalysis().getDomTree(); + AA = &getAnalysis().getAAResults(); -PreservedAnalyses -MachineSchedulerPass::run(MachineFunction &MF, - MachineFunctionAnalysisManager &MFAM) { - if (EnableMachineSched.getNumOccurrences()) { - if (!EnableMachineSched) - return PreservedAnalyses::all(); - } else if (!MF.getSubtarget().enableMachineScheduler()) { - return PreservedAnalyses::all(); - } + LIS = &getAnalysis().getLIS(); - LLVM_DEBUG(dbgs() << "Before MISched:\n"; MF.print(dbgs())); + if (VerifyScheduling) { + LLVM_DEBUG(LIS->dump()); + MF->verify(this, "Before machine scheduling.", &errs()); + } + RegClassInfo->runOnMachineFunction(*MF); - MachineSchedulerImpl Impl(MF, MFAM, TM); - bool Changed = Impl.run(); - if (!Changed) - return PreservedAnalyses::all(); + // Instantiate the selected scheduler for this target, function, and + // optimization level. 
+ std::unique_ptr Scheduler(createMachineScheduler()); + scheduleRegions(*Scheduler, false); - PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses(); - PA.preserveSet(); - PA.preserve(); - PA.preserve(); - return PA; + LLVM_DEBUG(LIS->dump()); + if (VerifyScheduling) + MF->verify(this, "After machine scheduling.", &errs()); + return true; } -bool PostMachineSchedulerLegacy::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(MF.getFunction())) +bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) { + if (skipFunction(mf.getFunction())) return false; if (EnablePostRAMachineSched.getNumOccurrences()) { if (!EnablePostRAMachineSched) return false; - } else if (!MF.getSubtarget().enablePostRAMachineScheduler()) { + } else if (!mf.getSubtarget().enablePostRAMachineScheduler()) { LLVM_DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n"); return false; } - LLVM_DEBUG(dbgs() << "Before post-MI-sched:\n"; MF.print(dbgs())); + LLVM_DEBUG(dbgs() << "Before post-MI-sched:\n"; mf.print(dbgs())); - PostMachineSchedulerImpl Impl(MF, this); - return Impl.run(); -} + // Initialize the context of the pass. 
+ MF = &mf; + MLI = &getAnalysis().getLI(); + AA = &getAnalysis().getAAResults(); -PreservedAnalyses -PostMachineSchedulerPass::run(MachineFunction &MF, - MachineFunctionAnalysisManager &MFAM) { - if (EnablePostRAMachineSched.getNumOccurrences()) { - if (!EnablePostRAMachineSched) - return PreservedAnalyses::all(); - } else if (!MF.getSubtarget().enablePostRAMachineScheduler()) { - LLVM_DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n"); - return PreservedAnalyses::all(); - } - LLVM_DEBUG(dbgs() << "Before post-MI-sched:\n"; MF.print(dbgs())); + if (VerifyScheduling) + MF->verify(this, "Before post machine scheduling.", &errs()); - PostMachineSchedulerImpl Impl(MF, MFAM, TM); - bool Changed = Impl.run(); - if (!Changed) - return PreservedAnalyses::all(); + // Instantiate the selected scheduler for this target, function, and + // optimization level. + std::unique_ptr Scheduler(createPostMachineScheduler()); + scheduleRegions(*Scheduler, true); - PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses(); - PA.preserveSet(); - return PA; + if (VerifyScheduling) + MF->verify(this, "After post machine scheduling.", &errs()); + return true; } /// Return true of the given instruction should not be included in a scheduling diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp index 51e047b2fa3f0..e1f05406297d2 100644 --- a/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -135,7 +135,7 @@ INITIALIZE_PASS_DEPENDENCY(LiveDebugVariablesWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) INITIALIZE_PASS_DEPENDENCY(RegisterCoalescerLegacy) -INITIALIZE_PASS_DEPENDENCY(MachineSchedulerLegacy) +INITIALIZE_PASS_DEPENDENCY(MachineScheduler) INITIALIZE_PASS_DEPENDENCY(LiveStacksWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp 
b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 2e43ad78e5d9b..465c4e8feffbb 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -155,7 +155,7 @@ INITIALIZE_PASS_DEPENDENCY(LiveDebugVariablesWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) INITIALIZE_PASS_DEPENDENCY(RegisterCoalescerLegacy) -INITIALIZE_PASS_DEPENDENCY(MachineSchedulerLegacy) +INITIALIZE_PASS_DEPENDENCY(MachineScheduler) INITIALIZE_PASS_DEPENDENCY(LiveStacksWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 650d23ac1d5ef..e7ba7213a76fe 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -119,7 +119,6 @@ #include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/MachineTraceMetrics.h" #include "llvm/CodeGen/MachineVerifier.h" #include "llvm/CodeGen/OptimizePHIs.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index fffd30b26dc1d..c6d36fde9730a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -69,7 +69,6 @@ #include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/CodeGen/MachineCSE.h" #include "llvm/CodeGen/MachineLICM.h" -#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -1932,7 +1931,6 @@ AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder( GCNTargetMachine &TM, const CGPassBuilderOption &Opts, PassInstrumentationCallbacks *PIC) : CodeGenPassBuilder(TM, Opts, PIC) { - Opt.MISchedPostRA = true; 
Opt.RequiresCodeGenSCCOrder = true; // Exceptions and StackMaps are not supported, so these passes will never do // anything. diff --git a/llvm/test/CodeGen/AArch64/a55-fuse-address.mir b/llvm/test/CodeGen/AArch64/a55-fuse-address.mir index 3e1b6076f0167..4edff043a7b3e 100644 --- a/llvm/test/CodeGen/AArch64/a55-fuse-address.mir +++ b/llvm/test/CodeGen/AArch64/a55-fuse-address.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -o - %s -mtriple=aarch64 -run-pass=machine-scheduler -verify-machineinstrs | FileCheck %s -# RUN: llc -o - %s -mtriple=aarch64 -passes=machine-scheduler | FileCheck %s --- | target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64" diff --git a/llvm/test/CodeGen/AArch64/ampere1-sched-add.mir b/llvm/test/CodeGen/AArch64/ampere1-sched-add.mir index 3a33291cbf8e0..e578b5d7f04f3 100644 --- a/llvm/test/CodeGen/AArch64/ampere1-sched-add.mir +++ b/llvm/test/CodeGen/AArch64/ampere1-sched-add.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 # RUN: llc -run-pass=machine-scheduler %s -o - | FileCheck %s -# RUN: llc -passes=machine-scheduler %s -o - | FileCheck %s --- | target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/CodeGen/AArch64/cluster-frame-index.mir b/llvm/test/CodeGen/AArch64/cluster-frame-index.mir index 5d761f10be3b2..37ab9418f4dbd 100644 --- a/llvm/test/CodeGen/AArch64/cluster-frame-index.mir +++ b/llvm/test/CodeGen/AArch64/cluster-frame-index.mir @@ -1,5 +1,4 @@ #RUN: llc -mtriple=aarch64-- -mcpu=cyclone -run-pass machine-scheduler -o - %s | FileCheck %s -#RUN: llc -mtriple=aarch64-- -mcpu=cyclone -passes=machine-scheduler -o - %s | FileCheck %s --- name: merge_stack # CHECK-LABEL: name: merge_stack diff --git a/llvm/test/CodeGen/AArch64/dump-reserved-cycles.mir b/llvm/test/CodeGen/AArch64/dump-reserved-cycles.mir index 
5655bfa5d2945..4bf8afff90d4c 100644 --- a/llvm/test/CodeGen/AArch64/dump-reserved-cycles.mir +++ b/llvm/test/CodeGen/AArch64/dump-reserved-cycles.mir @@ -1,15 +1,9 @@ # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 -misched-dump-reserved-cycles=true \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s -# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 -misched-dump-reserved-cycles=true \ -# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s - # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 -misched-dump-reserved-cycles=false\ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s --check-prefix=NODUMP -# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 -misched-dump-reserved-cycles=false\ -# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s --check-prefix=NODUMP - # REQUIRES: asserts --- name: f diff --git a/llvm/test/CodeGen/AArch64/dump-schedule-trace.mir b/llvm/test/CodeGen/AArch64/dump-schedule-trace.mir index c90d6bd3cb420..bff6d1d71b7c4 100644 --- a/llvm/test/CodeGen/AArch64/dump-schedule-trace.mir +++ b/llvm/test/CodeGen/AArch64/dump-schedule-trace.mir @@ -4,34 +4,17 @@ # RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-header-width=21 \ # RUN: 2>&1 | FileCheck %s --check-prefix=TOP --strict-whitespace -# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ -# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s \ -# RUN: -misched-prera-direction=topdown -sched-print-cycles=true \ -# RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-header-width=21 \ -# RUN: 2>&1 | FileCheck %s --check-prefix=TOP --strict-whitespace - # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s \ # RUN: -misched-prera-direction=bottomup 
-sched-print-cycles=true \ # RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-width=4 \ # RUN: 2>&1 | FileCheck %s --check-prefix=BOTTOM --strict-whitespace -# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ -# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s \ -# RUN: -misched-prera-direction=bottomup -sched-print-cycles=true \ -# RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-width=4 \ -# RUN: 2>&1 | FileCheck %s --check-prefix=BOTTOM --strict-whitespace - # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s \ # RUN: -sched-print-cycles=true -misched-dump-schedule-trace=true \ # RUN: 2>&1 | FileCheck %s --check-prefix=BIDIRECTIONAL -# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ -# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s \ -# RUN: -sched-print-cycles=true -misched-dump-schedule-trace=true \ -# RUN: 2>&1 | FileCheck %s --check-prefix=BIDIRECTIONAL - # REQUIRES: asserts, aarch64-registered-target --- name: f diff --git a/llvm/test/CodeGen/AArch64/force-enable-intervals.mir b/llvm/test/CodeGen/AArch64/force-enable-intervals.mir index 8d47eee1c8e19..a53d4e7480307 100644 --- a/llvm/test/CodeGen/AArch64/force-enable-intervals.mir +++ b/llvm/test/CodeGen/AArch64/force-enable-intervals.mir @@ -3,21 +3,11 @@ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler \ # RUN: -o - %s 2>&1 -misched-prera-direction=topdown | FileCheck %s -# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ -# RUN: -misched-dump-reserved-cycles=true \ -# RUN: -passes=machine-scheduler -debug-only=machine-scheduler \ -# RUN: -o - %s 2>&1 -misched-prera-direction=topdown | FileCheck %s - # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ # RUN: -misched-dump-reserved-cycles=true -sched-model-force-enable-intervals=true \ # RUN: -run-pass=machine-scheduler 
-debug-only=machine-scheduler \ # RUN: -o - %s 2>&1 -misched-prera-direction=topdown | FileCheck %s --check-prefix=FORCE -# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ -# RUN: -misched-dump-reserved-cycles=true -sched-model-force-enable-intervals=true \ -# RUN: -passes=machine-scheduler -debug-only=machine-scheduler \ -# RUN: -o - %s 2>&1 -misched-prera-direction=topdown | FileCheck %s --check-prefix=FORCE - # REQUIRES: asserts, aarch64-registered-target --- name: f diff --git a/llvm/test/CodeGen/AArch64/machine-scheduler.mir b/llvm/test/CodeGen/AArch64/machine-scheduler.mir index ba2c2b33d8e92..6c0222f4fdd78 100644 --- a/llvm/test/CodeGen/AArch64/machine-scheduler.mir +++ b/llvm/test/CodeGen/AArch64/machine-scheduler.mir @@ -1,5 +1,4 @@ # RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass machine-scheduler -verify-machineinstrs -o - %s | FileCheck %s -# RUN: llc -mtriple=aarch64-none-linux-gnu -passes=machine-scheduler -o - %s | FileCheck %s --- | define i64 @load_imp-def(ptr nocapture %P, i32 %v) { diff --git a/llvm/test/CodeGen/AArch64/macro-fusion-addsub-2reg-const1.mir b/llvm/test/CodeGen/AArch64/macro-fusion-addsub-2reg-const1.mir index 2f0d19fec07d9..8c5a85a4e7a61 100644 --- a/llvm/test/CodeGen/AArch64/macro-fusion-addsub-2reg-const1.mir +++ b/llvm/test/CodeGen/AArch64/macro-fusion-addsub-2reg-const1.mir @@ -1,7 +1,5 @@ # RUN: llc -o - %s -mtriple=aarch64-- -mattr=+fuse-addsub-2reg-const1 -run-pass postmisched | FileCheck %s --check-prefixes=CHECK,FUSION -# RUN: llc -o - %s -mtriple=aarch64-- -mattr=+fuse-addsub-2reg-const1 -passes=postmisched | FileCheck %s --check-prefixes=CHECK,FUSION # RUN: llc -o - %s -mtriple=aarch64-- -mattr=-fuse-addsub-2reg-const1 -run-pass postmisched | FileCheck %s --check-prefixes=CHECK,NOFUSION -# RUN: llc -o - %s -mtriple=aarch64-- -mattr=-fuse-addsub-2reg-const1 -passes=postmisched | FileCheck %s --check-prefixes=CHECK,NOFUSION --- # CHECK-LABEL: name: addsub2reg # CHECK: $w8 = ADDWrr killed renamable $w0, 
killed renamable $w1 diff --git a/llvm/test/CodeGen/AArch64/macro-fusion-last.mir b/llvm/test/CodeGen/AArch64/macro-fusion-last.mir index affd2bb039e96..14937a4794e96 100644 --- a/llvm/test/CodeGen/AArch64/macro-fusion-last.mir +++ b/llvm/test/CodeGen/AArch64/macro-fusion-last.mir @@ -1,7 +1,5 @@ # RUN: llc -o - %s -mtriple=aarch64-- -mattr=+arith-bcc-fusion -run-pass postmisched | FileCheck %s --check-prefixes=CHECK,FUSION -# RUN: llc -o - %s -mtriple=aarch64-- -mattr=+arith-bcc-fusion -passes=postmisched | FileCheck %s --check-prefixes=CHECK,FUSION # RUN: llc -o - %s -mtriple=aarch64-- -mattr=-arith-bcc-fusion -run-pass postmisched | FileCheck %s --check-prefixes=CHECK,NOFUSION -# RUN: llc -o - %s -mtriple=aarch64-- -mattr=-arith-bcc-fusion -passes=postmisched | FileCheck %s --check-prefixes=CHECK,NOFUSION # Make sure the last instruction is correctly macro-fused when scheduling # top-down (post-ra). --- diff --git a/llvm/test/CodeGen/AArch64/misched-branch-targets.mir b/llvm/test/CodeGen/AArch64/misched-branch-targets.mir index 954082631bdbf..40f148438e537 100644 --- a/llvm/test/CodeGen/AArch64/misched-branch-targets.mir +++ b/llvm/test/CodeGen/AArch64/misched-branch-targets.mir @@ -1,9 +1,6 @@ # RUN: llc -o - -run-pass=machine-scheduler -misched=shuffle %s | FileCheck %s # RUN: llc -o - -run-pass=postmisched %s | FileCheck %s -# RUN: llc -o - -passes=machine-scheduler -misched=shuffle %s | FileCheck %s -# RUN: llc -o - -passes=postmisched %s | FileCheck %s - # REQUIRES: asserts # -misched=shuffle is only available with assertions enabled diff --git a/llvm/test/CodeGen/AArch64/misched-bundle.mir b/llvm/test/CodeGen/AArch64/misched-bundle.mir index 8463cb038a3bc..ac6112e8c60ef 100644 --- a/llvm/test/CodeGen/AArch64/misched-bundle.mir +++ b/llvm/test/CodeGen/AArch64/misched-bundle.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a510 
-run-pass=machine-scheduler -debug-only=machine-scheduler %s -o - 2>&1 | FileCheck %s -# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a510 -passes=machine-scheduler -debug-only=machine-scheduler %s -o - 2>&1 | FileCheck %s # REQUIRES: asserts # CHECK: SU(0): renamable $z0 = LD1H renamable $p0, renamable $x1, renamable $x10 :: (load unknown-size, align 1) diff --git a/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-01.mir b/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-01.mir index ca92fa14a3fa8..ea40f9e52dcd6 100644 --- a/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-01.mir +++ b/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-01.mir @@ -6,14 +6,6 @@ # RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-header-width=21 \ # RUN: | FileCheck %s -# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -mcpu=cortex-a55 %s -o - 2>&1 \ -# RUN: -misched-dump-reserved-cycles=true \ -# RUN: -passes=machine-scheduler -debug-only=machine-scheduler \ -# RUN: -misched-prera-direction=bottomup -sched-print-cycles=true \ -# RUN: -misched-detail-resource-booking=true \ -# RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-header-width=21 \ -# RUN: | FileCheck %s - # REQUIRES: asserts, aarch64-registered-target --- | diff --git a/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-02.mir b/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-02.mir index 2b34ca54f1e97..9be91b8a01e86 100644 --- a/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-02.mir +++ b/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-02.mir @@ -5,13 +5,6 @@ # RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-width=4 \ # RUN: 2>&1 | FileCheck %s -# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a55 \ -# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s \ -# RUN: -misched-prera-direction=bottomup -sched-print-cycles=true \ -# RUN: 
-misched-dump-reserved-cycles=true -misched-detail-resource-booking=true\ -# RUN: -misched-dump-schedule-trace=true -misched-dump-schedule-trace-col-width=4 \ -# RUN: 2>&1 | FileCheck %s - # REQUIRES: asserts, aarch64-registered-target --- name: f diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-arith-logic.mir b/llvm/test/CodeGen/AArch64/misched-fusion-arith-logic.mir index 60c0026d39466..62276779d1423 100644 --- a/llvm/test/CodeGen/AArch64/misched-fusion-arith-logic.mir +++ b/llvm/test/CodeGen/AArch64/misched-fusion-arith-logic.mir @@ -1,7 +1,5 @@ # RUN: llc -o /dev/null 2>&1 %s -mtriple aarch64-unknown -mattr=fuse-arith-logic -run-pass=machine-scheduler -misched-print-dags | FileCheck %s -# RUN: llc -o /dev/null 2>&1 %s -mtriple aarch64-unknown -mattr=fuse-arith-logic -passes=machine-scheduler -misched-print-dags | FileCheck %s # RUN: llc -o /dev/null 2>&1 %s -mtriple aarch64-unknown -mcpu=exynos-m4 -run-pass=machine-scheduler -misched-print-dags | FileCheck %s -# RUN: llc -o /dev/null 2>&1 %s -mtriple aarch64-unknown -mcpu=exynos-m4 -passes=machine-scheduler -misched-print-dags | FileCheck %s # REQUIRES: asserts --- diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-cmp.mir b/llvm/test/CodeGen/AArch64/misched-fusion-cmp.mir index 82498164c6ad5..b0450c5b8c01b 100644 --- a/llvm/test/CodeGen/AArch64/misched-fusion-cmp.mir +++ b/llvm/test/CodeGen/AArch64/misched-fusion-cmp.mir @@ -1,5 +1,4 @@ # RUN: llc -o /dev/null 2>&1 %s -mtriple aarch64-unknown -mcpu=cortex-x1 -run-pass=machine-scheduler -# RUN: llc -o /dev/null 2>&1 %s -mtriple aarch64-unknown -mcpu=cortex-x1 -passes=machine-scheduler # Just ensure this doesn't crash. 
--- diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-crypto-eor.mir b/llvm/test/CodeGen/AArch64/misched-fusion-crypto-eor.mir index e661353615726..623a8221f5ed2 100644 --- a/llvm/test/CodeGen/AArch64/misched-fusion-crypto-eor.mir +++ b/llvm/test/CodeGen/AArch64/misched-fusion-crypto-eor.mir @@ -1,9 +1,6 @@ # RUN: llc -o /dev/null %s -run-pass=machine-scheduler -mtriple aarch64-- -mattr=-fuse-aes,+crypto -misched-print-dags 2>&1 | FileCheck %s --check-prefixes=CHECK,NOFUSE # RUN: llc -o /dev/null %s -run-pass=machine-scheduler -mtriple aarch64-- -mattr=+fuse-aes,+crypto -misched-print-dags 2>&1 | FileCheck %s --check-prefixes=CHECK,FUSEAES # RUN: llc -o /dev/null %s -run-pass=machine-scheduler -mtriple aarch64-- -mattr=+fuse-aes,+fuse-crypto-eor,+crypto -misched-print-dags 2>&1 | FileCheck %s --check-prefixes=CHECK,FUSEAES,FUSECRYPTO -# RUN: llc -o /dev/null %s -passes=machine-scheduler -mtriple aarch64-- -mattr=-fuse-aes,+crypto -misched-print-dags 2>&1 | FileCheck %s --check-prefixes=CHECK,NOFUSE -# RUN: llc -o /dev/null %s -passes=machine-scheduler -mtriple aarch64-- -mattr=+fuse-aes,+crypto -misched-print-dags 2>&1 | FileCheck %s --check-prefixes=CHECK,FUSEAES -# RUN: llc -o /dev/null %s -passes=machine-scheduler -mtriple aarch64-- -mattr=+fuse-aes,+fuse-crypto-eor,+crypto -misched-print-dags 2>&1 | FileCheck %s --check-prefixes=CHECK,FUSEAES,FUSECRYPTO # REQUIRES: asserts name: func diff --git a/llvm/test/CodeGen/AArch64/misched-move-imm.mir b/llvm/test/CodeGen/AArch64/misched-move-imm.mir index 65608bb5f1a1c..b5ff01b3c5b13 100644 --- a/llvm/test/CodeGen/AArch64/misched-move-imm.mir +++ b/llvm/test/CodeGen/AArch64/misched-move-imm.mir @@ -1,5 +1,4 @@ # RUN: llc -run-pass=machine-scheduler -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 %s -o /dev/null 2>&1 -# RUN: llc -passes=machine-scheduler -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 %s -o /dev/null 2>&1 # Just ensure this doesn't crash. 
Ensures in the neoverse-v2 # scheduling model we don't attempt to treat the first input # operand of MOVZXi as an immediate operand. diff --git a/llvm/test/CodeGen/AArch64/misched-predicate-virtreg.mir b/llvm/test/CodeGen/AArch64/misched-predicate-virtreg.mir index 17a6cf7e6faa9..0b14ceeef9a09 100644 --- a/llvm/test/CodeGen/AArch64/misched-predicate-virtreg.mir +++ b/llvm/test/CodeGen/AArch64/misched-predicate-virtreg.mir @@ -1,5 +1,4 @@ # RUN: llc -mcpu=exynos-m5 -mtriple=aarch64 -enable-misched -run-pass=machine-scheduler -debug-only=machine-scheduler %s -o /dev/null 2>&1 | FileCheck %s -# RUN: llc -mcpu=exynos-m5 -mtriple=aarch64 -enable-misched -passes=machine-scheduler -debug-only=machine-scheduler %s -o /dev/null 2>&1 | FileCheck %s # REQUIRES: asserts # CHECK-LABEL: ********** MI Scheduling ********** diff --git a/llvm/test/CodeGen/AArch64/misched-sort-resource-in-trace.mir b/llvm/test/CodeGen/AArch64/misched-sort-resource-in-trace.mir index b652d2463fc12..b04fd89b796ba 100644 --- a/llvm/test/CodeGen/AArch64/misched-sort-resource-in-trace.mir +++ b/llvm/test/CodeGen/AArch64/misched-sort-resource-in-trace.mir @@ -3,21 +3,11 @@ # RUN: -misched-prera-direction=topdown -sched-print-cycles=true \ # RUN: -misched-dump-schedule-trace=true --misched-sort-resources-in-trace=true 2>&1 | FileCheck --check-prefix=SORTED %s -# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=exynos-m3 -verify-machineinstrs \ -# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s \ -# RUN: -misched-prera-direction=topdown -sched-print-cycles=true \ -# RUN: -misched-dump-schedule-trace=true --misched-sort-resources-in-trace=true 2>&1 | FileCheck --check-prefix=SORTED %s - # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=exynos-m3 -verify-machineinstrs \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s \ # RUN: -misched-prera-direction=topdown -sched-print-cycles=true \ # RUN: -misched-dump-schedule-trace=true --misched-sort-resources-in-trace=false 
2>&1 | FileCheck --check-prefix=UNSORTED %s -# RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=exynos-m3 -verify-machineinstrs \ -# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s \ -# RUN: -misched-prera-direction=topdown -sched-print-cycles=true \ -# RUN: -misched-dump-schedule-trace=true --misched-sort-resources-in-trace=false 2>&1 | FileCheck --check-prefix=UNSORTED %s - # REQUIRES: asserts, aarch64-registered-target --- name: test diff --git a/llvm/test/CodeGen/AArch64/sched-postidxalias.mir b/llvm/test/CodeGen/AArch64/sched-postidxalias.mir index 02256ca30d842..98ee0fa21b2dd 100644 --- a/llvm/test/CodeGen/AArch64/sched-postidxalias.mir +++ b/llvm/test/CodeGen/AArch64/sched-postidxalias.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=aarch64 -mcpu=cortex-a55 -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s -# RUN: llc -mtriple=aarch64 -mcpu=cortex-a55 -passes=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s # REQUIRES: asserts # Both the accesses should have an offset of 0 diff --git a/llvm/test/CodeGen/AArch64/sched-print-cycle.mir b/llvm/test/CodeGen/AArch64/sched-print-cycle.mir index d58037e987773..59c51571df74b 100644 --- a/llvm/test/CodeGen/AArch64/sched-print-cycle.mir +++ b/llvm/test/CodeGen/AArch64/sched-print-cycle.mir @@ -1,15 +1,9 @@ # RUN: llc -mtriple=arm64-apple-macos -mcpu=apple-m1 -sched-print-cycles=true \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s -# RUN: llc -mtriple=arm64-apple-macos -mcpu=apple-m1 -sched-print-cycles=true \ -# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s - # RUN: llc -mtriple=arm64-apple-macos -mcpu=apple-m1 -sched-print-cycles=false \ # RUN: -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s --check-prefix=NOCYCLES -# 
RUN: llc -mtriple=arm64-apple-macos -mcpu=apple-m1 -sched-print-cycles=false \ -# RUN: -passes=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s --check-prefix=NOCYCLES - # REQUIRES: asserts --- name: mul_mul diff --git a/llvm/test/CodeGen/AArch64/scheduledag-constreg.mir b/llvm/test/CodeGen/AArch64/scheduledag-constreg.mir index 66680af3f856b..65ec43407413f 100644 --- a/llvm/test/CodeGen/AArch64/scheduledag-constreg.mir +++ b/llvm/test/CodeGen/AArch64/scheduledag-constreg.mir @@ -1,5 +1,4 @@ # RUN: llc -o /dev/null %s -mtriple=aarch64-- -run-pass=machine-scheduler -enable-misched -debug-only=machine-scheduler 2>&1 | FileCheck %s -# RUN: llc -o /dev/null %s -mtriple=aarch64-- -passes=machine-scheduler -enable-misched -debug-only=machine-scheduler 2>&1 | FileCheck %s # REQUIRES: asserts --- | define void @func() { ret void } diff --git a/llvm/test/CodeGen/AArch64/sve-aliasing.mir b/llvm/test/CodeGen/AArch64/sve-aliasing.mir index 34a08adc417cf..3b7c9fefa5277 100644 --- a/llvm/test/CodeGen/AArch64/sve-aliasing.mir +++ b/llvm/test/CodeGen/AArch64/sve-aliasing.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -o - %s -mtriple=aarch64 -run-pass=machine-scheduler -verify-machineinstrs | FileCheck %s -# RUN: llc -o - %s -mtriple=aarch64 -passes=machine-scheduler | FileCheck %s --- name: scalable_v16i1 diff --git a/llvm/test/CodeGen/AMDGPU/at-least-one-def-value-assert.mir b/llvm/test/CodeGen/AMDGPU/at-least-one-def-value-assert.mir index 1c4093b2feb9b..82ee173e12256 100644 --- a/llvm/test/CodeGen/AMDGPU/at-least-one-def-value-assert.mir +++ b/llvm/test/CodeGen/AMDGPU/at-least-one-def-value-assert.mir @@ -1,7 +1,5 @@ # RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -o /dev/null %s 2>&1 | FileCheck %s -# RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=machine-scheduler -verify-misched -o 
/dev/null %s 2>&1 | FileCheck %s - # CHECK: *** Bad machine code: No live subrange at use *** # CHECK-NEXT: - function: at_least_one_value_should_be_defined_by_this_mask # CHECK-NEXT: - basic block: %bb.0 diff --git a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir index 1ae544f3c074a..0d84dc0bdc53e 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir +++ b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir @@ -1,5 +1,4 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass machine-scheduler %s -o - | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes=machine-scheduler %s -o - | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: cluster_flat_loads # GCN: FLAT_LOAD_DWORD %0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir index b38dc4d21c10c..4945c7020ca18 100644 --- a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir +++ b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -run-pass=machine-scheduler -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=machine-scheduler -o - %s | FileCheck %s # The DBG_VALUE in bb.5 ends a scheduling region, and its uses should # not be tracked like a normal instruction. 
diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir index 156979d6d06a5..8a1c68b3f6615 100644 --- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir +++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=machine-scheduler -verify-machineinstrs %s -o - | FileCheck %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=machine-scheduler %s -o - | FileCheck %s --- | declare void @llvm.dbg.value(metadata, metadata, metadata) #0 diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir index d415346b49b28..19071be7ebde4 100644 --- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir +++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir @@ -1,5 +1,4 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -passes=machine-scheduler %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s # REQUIRES: asserts # CHECK: ********** MI Scheduling ********** diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir index 170672dc4af64..4f15e0ef68977 100644 --- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir +++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler.mir @@ -1,5 +1,4 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -passes=machine-scheduler %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s # REQUIRES: asserts # CHECK: All regions recorded, starting actual scheduling. 
diff --git a/llvm/test/CodeGen/AMDGPU/flat-load-clustering.mir b/llvm/test/CodeGen/AMDGPU/flat-load-clustering.mir index 204912b4d4881..962d49df8509e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-load-clustering.mir +++ b/llvm/test/CodeGen/AMDGPU/flat-load-clustering.mir @@ -1,5 +1,4 @@ # RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -run-pass machine-scheduler -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=tonga -passes=machine-scheduler -o - %s | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: flat_load_clustering # GCN: FLAT_LOAD_DWORD diff --git a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir index 78f21ef6610f2..d57450baea911 100644 --- a/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir +++ b/llvm/test/CodeGen/AMDGPU/high-RP-reschedule.mir @@ -1,8 +1,6 @@ # REQUIRES: asserts -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -passes=machine-scheduler -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -amdgpu-use-amdgpu-trackers=1 -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN-GCNTRACKER %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -passes=machine-scheduler -amdgpu-use-amdgpu-trackers=1 -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN-GCNTRACKER %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-misched -run-pass=machine-scheduler -amdgpu-use-amdgpu-trackers=1 -verify-misched -debug-only=machine-scheduler -o - %s 2>&1 | FileCheck -check-prefix=GCN-GCNTRACKER %s 
--- | define amdgpu_kernel void @high-RP-reschedule() { ret void } diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir index 5dc6d2ee8f695..e32de1e42aac4 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-debug.mir @@ -1,5 +1,4 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck -check-prefix=DEBUG %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -passes=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck -check-prefix=DEBUG %s # REQUIRES: asserts --- | diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir index 9991cb1837e01..fb65d80c46e06 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -passes=machine-scheduler -amdgpu-disable-unclustered-high-rp-reschedule %s -o - | FileCheck -check-prefix=GFX908 %s --- name: test_occ_10_max_occ_no_sink diff --git a/llvm/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir b/llvm/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir index ffc86dc5eee6f..2aa430400e49a 100644 --- a/llvm/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir +++ b/llvm/test/CodeGen/AMDGPU/macro-fusion-cluster-vcc-uses.mir @@ -1,5 
+1,4 @@ # RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -run-pass machine-scheduler -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=tahiti -passes=machine-scheduler -o - %s | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: cluster_add_addc # GCN: S_NOP 0, implicit-def $vcc diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir index c90975959c3f4..c933fb0de5864 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=machine-scheduler -verify-misched -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s # This would assert that a dead def should have no uses, but the dead # def and use have different subreg indices. 
diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir index 2cd78062ccbd7..add7825a224ed 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=machine-scheduler -verify-machineinstrs %s -o - | FileCheck %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=machine-scheduler %s -o - | FileCheck %s # The sequence of DBG_VALUEs forms a scheduling region with 0 real # instructions. The RegPressure tracker would end up skipping over any diff --git a/llvm/test/CodeGen/AMDGPU/sched-barrier-hang-weak-dep.mir b/llvm/test/CodeGen/AMDGPU/sched-barrier-hang-weak-dep.mir index f797b01d49bf8..3fdb0c7c0885b 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-barrier-hang-weak-dep.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-barrier-hang-weak-dep.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -passes=machine-scheduler -verify-misched -o - %s | FileCheck %s # This would hang after removing edges from the SCHED_BARRIER since the number # of Preds/Succs would be left in an inconsistent state. 
diff --git a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir index 3254f5e45e4f4..09037709d51d8 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir @@ -1,5 +1,4 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=machine-scheduler -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -passes=machine-scheduler -o - %s | FileCheck %s --- | %struct.widget.0 = type { float, i32, i32 } diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir index 3ca61d26e8e42..6796391aba675 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -verify-misched -run-pass=machine-scheduler -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-misched -passes=machine-scheduler -o - %s | FileCheck %s --- name: handleMoveUp_incorrect_interval diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir index 099cfc4f1dd54..0b1fd441256d8 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir @@ -1,8 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=machine-scheduler -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -passes=machine-scheduler -o - %s | FileCheck %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=machine-scheduler -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 
-passes=machine-scheduler -o - %s | FileCheck %s # Make sure FP mode is not a hard scheduling boundary --- diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir index 88e11c9ce3d1d..e67036f0bbbea 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=machine-scheduler -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes=machine-scheduler -o - %s | FileCheck %s --- # Check that the high latency loads are both scheduled first, before the diff --git a/llvm/test/CodeGen/AMDGPU/sreg-xnull-regclass-bitwidth.mir b/llvm/test/CodeGen/AMDGPU/sreg-xnull-regclass-bitwidth.mir index 3091fe85fa8bc..d8d4f5d0220c9 100644 --- a/llvm/test/CodeGen/AMDGPU/sreg-xnull-regclass-bitwidth.mir +++ b/llvm/test/CodeGen/AMDGPU/sreg-xnull-regclass-bitwidth.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=postmisched -o - %s | FileCheck %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=postmisched -o - %s | FileCheck %s --- name: test_xnull_256 body: | diff --git a/llvm/test/CodeGen/ARM/cortex-m7-wideops.mir b/llvm/test/CodeGen/ARM/cortex-m7-wideops.mir index 1bee32f4c90cd..0a47b87b422dd 100644 --- a/llvm/test/CodeGen/ARM/cortex-m7-wideops.mir +++ b/llvm/test/CodeGen/ARM/cortex-m7-wideops.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple arm-arm-eabi -mcpu=cortex-m7 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck %s -# RUN: llc -mtriple arm-arm-eabi -mcpu=cortex-m7 -passes=postmisched %s -o - | FileCheck %s --- name: test_groups alignment: 2 diff --git a/llvm/test/CodeGen/ARM/misched-branch-targets.mir 
b/llvm/test/CodeGen/ARM/misched-branch-targets.mir index 610344f844001..d828d9e516273 100644 --- a/llvm/test/CodeGen/ARM/misched-branch-targets.mir +++ b/llvm/test/CodeGen/ARM/misched-branch-targets.mir @@ -1,7 +1,5 @@ # RUN: llc -o - -run-pass=machine-scheduler -misched=shuffle %s | FileCheck %s -# RUN: llc -o - -passes=machine-scheduler -misched=shuffle %s | FileCheck %s # RUN: llc -o - -run-pass=postmisched %s | FileCheck %s -# RUN: llc -o - -passes=postmisched %s | FileCheck %s # REQUIRES: asserts # -misched=shuffle is only available with assertions enabled diff --git a/llvm/test/CodeGen/PowerPC/topdepthreduce-postra.mir b/llvm/test/CodeGen/PowerPC/topdepthreduce-postra.mir index 8bdbe288d98e6..627e553475480 100644 --- a/llvm/test/CodeGen/PowerPC/topdepthreduce-postra.mir +++ b/llvm/test/CodeGen/PowerPC/topdepthreduce-postra.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -run-pass=postmisched -o - %s | FileCheck %s -# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -passes=postmisched -o - %s | FileCheck %s --- # Check that postmisched's TopDepthReduce heuristic moves the MULLD later # because of the dependency on x5 diff --git a/llvm/test/CodeGen/RISCV/misched-postra-direction.mir b/llvm/test/CodeGen/RISCV/misched-postra-direction.mir index e4b934c3036ae..2cca042bebee6 100644 --- a/llvm/test/CodeGen/RISCV/misched-postra-direction.mir +++ b/llvm/test/CodeGen/RISCV/misched-postra-direction.mir @@ -11,19 +11,6 @@ # RUN: -misched-dump-schedule-trace -misched-postra-direction=bidirectional \ # RUN: -o - %s 2>&1 | FileCheck --check-prefix=BIDIRECTIONAL %s -# RUN: llc -mtriple=riscv64 -mcpu=sifive-x280 -passes=postmisched \ -# RUN: -enable-post-misched -debug-only=machine-scheduler \ -# RUN: -misched-dump-schedule-trace -misched-postra-direction=topdown \ -# RUN: -o - %s 2>&1 | FileCheck --check-prefix=TOPDOWN %s -# RUN: llc -mtriple=riscv64 
-mcpu=sifive-x280 -passes=postmisched \ -# RUN: -enable-post-misched -debug-only=machine-scheduler \ -# RUN: -misched-dump-schedule-trace -misched-postra-direction=bottomup \ -# RUN: -o - %s 2>&1 | FileCheck --check-prefix=BOTTOMUP %s -# RUN: llc -mtriple=riscv64 -mcpu=sifive-x280 -passes=postmisched \ -# RUN: -enable-post-misched -debug-only=machine-scheduler \ -# RUN: -misched-dump-schedule-trace -misched-postra-direction=bidirectional \ -# RUN: -o - %s 2>&1 | FileCheck --check-prefix=BIDIRECTIONAL %s - # REQUIRES: asserts --- From 16df836a527e4a04d2cbdb52365c81ff80e3e757 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 8 Feb 2025 11:32:22 +0000 Subject: [PATCH 050/293] [VPlan] Mark hasVF & hasScalableVF as const (NFC). --- llvm/lib/Transforms/Vectorize/VPlan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 9988e03e9fdca..30c568c3035f6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3669,8 +3669,8 @@ class VPlan { VFs.insert(VF); } - bool hasVF(ElementCount VF) { return VFs.count(VF); } - bool hasScalableVF() { + bool hasVF(ElementCount VF) const { return VFs.count(VF); } + bool hasScalableVF() const { return any_of(VFs, [](ElementCount VF) { return VF.isScalable(); }); } From 0cdb467c7da731bb83abc75480cbf66ad64aa014 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Sat, 8 Feb 2025 11:39:11 +0000 Subject: [PATCH 051/293] [lldb][TypeSystemClang] Create EnumExtensibilityAttr from DW_AT_APPLE_enum_kind (#126221) This patch consumes the `DW_AT_APPLE_enum_kind` attribute added in https://github.com/llvm/llvm-project/pull/124752 and turns it into a Clang attribute in the AST. 
This will currently be used by the Swift language plugin when it creates `EnumDecl`s from debug-info and passes them to the Swift compiler, which expects these attributes. --- .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 9 +++-- .../SymbolFile/DWARF/DWARFASTParserClang.h | 4 +++ .../TypeSystem/Clang/TypeSystemClang.cpp | 7 +++- .../TypeSystem/Clang/TypeSystemClang.h | 13 ++++---- lldb/test/Shell/Expr/TestEnumExtensibility.m | 33 +++++++++++++++++++ 5 files changed, 57 insertions(+), 9 deletions(-) create mode 100644 lldb/test/Shell/Expr/TestEnumExtensibility.m diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index 39296ba5b437f..ec0004c70c6da 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -492,6 +492,10 @@ ParsedDWARFTypeAttributes::ParsedDWARFTypeAttributes(const DWARFDIE &die) { case DW_AT_reference: ref_qual = clang::RQ_LValue; break; + case DW_AT_APPLE_enum_kind: + enum_kind = static_cast<clang::EnumExtensibilityAttr::Kind>( + form_value.Unsigned()); + break; } } } @@ -1001,9 +1005,10 @@ TypeSP DWARFASTParserClang::ParseEnum(const SymbolContext &sc, } CompilerType clang_type = m_ast.CreateEnumerationType( - attrs.name.GetStringRef(), GetClangDeclContextContainingDIE(def_die, nullptr), + attrs.name.GetStringRef(), + GetClangDeclContextContainingDIE(def_die, nullptr), GetOwningClangModule(def_die), attrs.decl, enumerator_clang_type, - attrs.is_scoped_enum); + attrs.is_scoped_enum, attrs.enum_kind); TypeSP type_sp = dwarf->MakeType(def_die.GetID(), attrs.name, attrs.byte_size, nullptr, attrs.type.Reference().GetID(), Type::eEncodingIsUID, diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h index 36fb381d3e291..135dd06186c4b 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h +++
b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h @@ -568,6 +568,10 @@ struct ParsedDWARFTypeAttributes { ///< Indicates ref-qualifier of C++ member function if present. ///< Is RQ_None otherwise. clang::RefQualifierKind ref_qual = clang::RQ_None; + + ///< Has a value if this DIE represents an enum that was declared + ///< with enum_extensibility. + std::optional<clang::EnumExtensibilityAttr::Kind> enum_kind; }; #endif // LLDB_SOURCE_PLUGINS_SYMBOLFILE_DWARF_DWARFASTPARSERCLANG_H diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index ecb571b1161bb..4901b6029d9ce 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -2303,7 +2303,8 @@ CompilerType TypeSystemClang::GetOrCreateStructForIdentifier( CompilerType TypeSystemClang::CreateEnumerationType( llvm::StringRef name, clang::DeclContext *decl_ctx, OptionalClangModuleID owning_module, const Declaration &decl, - const CompilerType &integer_clang_type, bool is_scoped) { + const CompilerType &integer_clang_type, bool is_scoped, + std::optional<clang::EnumExtensibilityAttr::Kind> enum_kind) { // TODO: Do something intelligent with the Declaration object passed in // like maybe filling in the SourceLocation with it... ASTContext &ast = getASTContext(); @@ -2321,6 +2322,10 @@ CompilerType TypeSystemClang::CreateEnumerationType( if (decl_ctx) decl_ctx->addDecl(enum_decl); + if (enum_kind) + enum_decl->addAttr( + clang::EnumExtensibilityAttr::CreateImplicit(ast, *enum_kind)); + // TODO: check if we should be setting the promotion type too?
enum_decl->setIntegerType(ClangUtil::GetQualType(integer_clang_type)); diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h index e70ad4c2973a5..99d9becffd128 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h @@ -22,6 +22,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/ASTFwd.h" +#include "clang/AST/Attr.h" #include "clang/AST/Decl.h" #include "clang/AST/TemplateBase.h" #include "clang/AST/Type.h" @@ -498,12 +499,12 @@ class TypeSystemClang : public TypeSystem { bool is_vector); // Enumeration Types - CompilerType CreateEnumerationType(llvm::StringRef name, - clang::DeclContext *decl_ctx, - OptionalClangModuleID owning_module, - const Declaration &decl, - const CompilerType &integer_qual_type, - bool is_scoped); + CompilerType CreateEnumerationType( + llvm::StringRef name, clang::DeclContext *decl_ctx, + OptionalClangModuleID owning_module, const Declaration &decl, + const CompilerType &integer_qual_type, bool is_scoped, + std::optional<clang::EnumExtensibilityAttr::Kind> enum_kind = + std::nullopt); // Integer type functions diff --git a/lldb/test/Shell/Expr/TestEnumExtensibility.m b/lldb/test/Shell/Expr/TestEnumExtensibility.m new file mode 100644 index 0000000000000..738b4fa2c7786 --- /dev/null +++ b/lldb/test/Shell/Expr/TestEnumExtensibility.m @@ -0,0 +1,33 @@ +// UNSUPPORTED: system-linux, system-windows + +// RUN: %clangxx_host %s -c -g -o %t +// RUN: %lldb %t \ +// RUN: -o "target var gClosed gOpen gNS gNSOpts" \ +// RUN: -o "image dump ast" \ +// RUN: 2>&1 | FileCheck %s + +#import <Foundation/Foundation.h> + +enum __attribute__((enum_extensibility(closed))) Closed { C1 } gClosed; + +enum __attribute__((enum_extensibility(open))) Open { O1 } gOpen; + +typedef NS_ENUM(int, NS) { N1 } gNS; + +typedef NS_OPTIONS(int, NSO) { OPT1 } gNSOpts; + +// CHECK: EnumDecl {{.*}} Closed +// CHECK-NEXT: |-EnumExtensibilityAttr {{.*}} Closed +// CHECK-NEXT: `-EnumConstantDecl
{{.*}} C1 'Closed' + +// CHECK: EnumDecl {{.*}} Open +// CHECK-NEXT: |-EnumExtensibilityAttr {{.*}} Open +// CHECK-NEXT: `-EnumConstantDecl {{.*}} O1 'Open' + +// CHECK: EnumDecl {{.*}} NS +// CHECK-NEXT: |-EnumExtensibilityAttr {{.*}} Open +// CHECK-NEXT: `-EnumConstantDecl {{.*}} N1 'NS' + +// CHECK: EnumDecl {{.*}} NSO +// CHECK-NEXT: |-EnumExtensibilityAttr {{.*}} Open +// CHECK-NEXT: `-EnumConstantDecl {{.*}} OPT1 'NSO' From ee806646ad893fcb0d19a75cebcc1f0e0bccabf1 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 8 Feb 2025 12:19:25 +0000 Subject: [PATCH 052/293] [VPlan] Consistently use hasScalarVFOnly (NFC). Consistently use hasScalarVFOnly instead of using hasVF(ElementCount::getFixed(1)). Also add an assert to ensure all cases are covered by hasScalarVFOnly. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 11 +++++------ llvm/lib/Transforms/Vectorize/VPlan.h | 7 ++++++- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 6ad44259ccdf6..4a03d6e9a3ced 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4559,11 +4559,10 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1)); LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); - assert(any_of(VPlans, - [](std::unique_ptr &P) { - return P->hasVF(ElementCount::getFixed(1)); - }) && - "Expected Scalar VF to be a candidate"); + assert( + any_of(VPlans, + [](std::unique_ptr &P) { return P->hasScalarVFOnly(); }) && + "Expected Scalar VF to be a candidate"); const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, ExpectedCost); @@ -8929,7 +8928,7 @@ void 
LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) { - bool HasScalarVF = Plan->hasVF(ElementCount::getFixed(1)); + bool HasScalarVF = Plan->hasScalarVFOnly(); // Now optimize the initial VPlan. if (!HasScalarVF) VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 30c568c3035f6..3816e1b61576a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3680,7 +3680,12 @@ class VPlan { return {VFs.begin(), VFs.end()}; } - bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); } + bool hasScalarVFOnly() const { + bool HasScalarVFOnly = VFs.size() == 1 && VFs[0].isScalar(); + assert(HasScalarVFOnly == hasVF(ElementCount::getFixed(1)) && + "Plan with scalar VF should only have a single VF"); + return HasScalarVFOnly; + } bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 7e9ef46133936..1380946fd6b4d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -591,7 +591,7 @@ static SmallVector collectUsersRecursively(VPValue *V) { static void legalizeAndOptimizeInductions(VPlan &Plan) { using namespace llvm::VPlanPatternMatch; VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1)); + bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly(); VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); for (VPRecipeBase &Phi : HeaderVPBB->phis()) { auto *PhiR = dyn_cast(&Phi); From 66bea0df75ccdd5ffed41d06c7301a116d11abcb Mon Sep 17 
00:00:00 2001 From: Amr Hesham Date: Sat, 8 Feb 2025 14:14:16 +0100 Subject: [PATCH 053/293] [llvm-objcopy] Fix prints wrong path when dump-section output path doesn't exist (#125345) Fix printing the correct file path in the error message when the output file specified by `--dump-section` cannot be opened Fixes: #125113 on ELF, MachO, Wasm --- llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp | 59 ++++++++++--------- llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp | 27 +++++---- llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp | 15 ++--- .../tools/llvm-objcopy/ELF/dump-section.test | 4 ++ .../llvm-objcopy/MachO/dump-section.test | 4 ++ .../tools/llvm-objcopy/wasm/dump-section.test | 4 ++ 6 files changed, 64 insertions(+), 49 deletions(-) diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp index 5aa0079f3fbc7..9c78f7433ad33 100644 --- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp +++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp @@ -186,27 +186,28 @@ static std::unique_ptr createWriter(const CommonConfig &Config, } static Error dumpSectionToFile(StringRef SecName, StringRef Filename, - Object &Obj) { + StringRef InputFilename, Object &Obj) { for (auto &Sec : Obj.sections()) { if (Sec.Name == SecName) { if (Sec.Type == SHT_NOBITS) - return createStringError(object_error::parse_failed, - "cannot dump section '%s': it has no contents", - SecName.str().c_str()); + return createFileError(InputFilename, object_error::parse_failed, + "cannot dump section '%s': it has no contents", + SecName.str().c_str()); Expected> BufferOrErr = FileOutputBuffer::create(Filename, Sec.OriginalData.size()); if (!BufferOrErr) - return BufferOrErr.takeError(); + return createFileError(Filename, BufferOrErr.takeError()); std::unique_ptr Buf = std::move(*BufferOrErr); std::copy(Sec.OriginalData.begin(), Sec.OriginalData.end(), Buf->getBufferStart()); if (Error E = Buf->commit()) - return E; + return createFileError(Filename, std::move(E)); return Error::success(); } } - return 
createStringError(object_error::parse_failed, "section '%s' not found", - SecName.str().c_str()); + + return createFileError(InputFilename, object_error::parse_failed, + "section '%s' not found", SecName.str().c_str()); } Error Object::compressOrDecompressSections(const CommonConfig &Config) { @@ -798,7 +799,8 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig, StringRef SectionName; StringRef FileName; std::tie(SectionName, FileName) = Flag.split('='); - if (Error E = dumpSectionToFile(SectionName, FileName, Obj)) + if (Error E = + dumpSectionToFile(SectionName, FileName, Config.InputFilename, Obj)) return E; } @@ -807,10 +809,10 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig, // us to avoid reporting the inappropriate errors about removing symbols // named in relocations. if (Error E = replaceAndRemoveSections(Config, ELFConfig, Obj)) - return E; + return createFileError(Config.InputFilename, std::move(E)); if (Error E = updateAndRemoveSymbols(Config, ELFConfig, Obj)) - return E; + return createFileError(Config.InputFilename, std::move(E)); if (!Config.SetSectionAlignment.empty()) { for (SectionBase &Sec : Obj.sections()) { @@ -826,8 +828,8 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig, if (Config.ChangeSectionLMAValAll > 0 && Seg.PAddr > std::numeric_limits::max() - Config.ChangeSectionLMAValAll) { - return createStringError( - errc::invalid_argument, + return createFileError( + Config.InputFilename, errc::invalid_argument, "address 0x" + Twine::utohexstr(Seg.PAddr) + " cannot be increased by 0x" + Twine::utohexstr(Config.ChangeSectionLMAValAll) + @@ -835,8 +837,8 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig, } else if (Config.ChangeSectionLMAValAll < 0 && Seg.PAddr < std::numeric_limits::min() - Config.ChangeSectionLMAValAll) { - return createStringError( - errc::invalid_argument, + return createFileError( + 
Config.InputFilename, errc::invalid_argument, "address 0x" + Twine::utohexstr(Seg.PAddr) + " cannot be decreased by 0x" + Twine::utohexstr(std::abs(Config.ChangeSectionLMAValAll)) + @@ -849,10 +851,9 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig, if (!Config.ChangeSectionAddress.empty()) { if (Obj.Type != ELF::ET_REL) - return createStringError( - object_error::invalid_file_type, + return createFileError( + Config.InputFilename, object_error::invalid_file_type, "cannot change section address in a non-relocatable file"); - StringMap SectionsToUpdateAddress; for (const SectionPatternAddressUpdate &PatternUpdate : make_range(Config.ChangeSectionAddress.rbegin(), @@ -863,8 +864,8 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig, .second) { if (PatternUpdate.Update.Kind == AdjustKind::Subtract && Sec.Addr < PatternUpdate.Update.Value) { - return createStringError( - errc::invalid_argument, + return createFileError( + Config.InputFilename, errc::invalid_argument, "address 0x" + Twine::utohexstr(Sec.Addr) + " cannot be decreased by 0x" + Twine::utohexstr(PatternUpdate.Update.Value) + @@ -873,8 +874,8 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig, if (PatternUpdate.Update.Kind == AdjustKind::Add && Sec.Addr > std::numeric_limits::max() - PatternUpdate.Update.Value) { - return createStringError( - errc::invalid_argument, + return createFileError( + Config.InputFilename, errc::invalid_argument, "address 0x" + Twine::utohexstr(Sec.Addr) + " cannot be increased by 0x" + Twine::utohexstr(PatternUpdate.Update.Value) + @@ -909,7 +910,7 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig, if (!ELFConfig.NotesToRemove.empty()) { if (Error Err = removeNotes(Obj, E, ELFConfig.NotesToRemove, Config.ErrorCallback)) - return Err; + return createFileError(Config.InputFilename, std::move(Err)); } for (const NewSectionInfo &AddedSection : Config.AddSection) 
{ @@ -924,7 +925,7 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig, return Error::success(); }; if (Error E = handleUserSection(AddedSection, AddSection)) - return E; + return createFileError(Config.InputFilename, std::move(E)); } for (const NewSectionInfo &NewSection : Config.UpdateSection) { @@ -932,7 +933,7 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig, return Obj.updateSection(Name, Data); }; if (Error E = handleUserSection(NewSection, UpdateSection)) - return E; + return createFileError(Config.InputFilename, std::move(E)); } if (!Config.AddGnuDebugLink.empty()) @@ -943,7 +944,7 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig, // before adding new symbols. if (!Obj.SymbolTable && !Config.SymbolsToAdd.empty()) if (Error E = Obj.addNewSymbolTable()) - return E; + return createFileError(Config.InputFilename, std::move(E)); for (const NewSymbolInfo &SI : Config.SymbolsToAdd) addSymbol(Obj, SI, ELFConfig.NewSymbolVisibility); @@ -955,7 +956,7 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig, if (Iter != Config.SetSectionFlags.end()) { const SectionFlagsUpdate &SFU = Iter->second; if (Error E = setSectionFlagsAndType(Sec, SFU.NewFlags, Obj.Machine)) - return E; + return createFileError(Config.InputFilename, std::move(E)); } auto It2 = Config.SetSectionType.find(Sec.Name); if (It2 != Config.SetSectionType.end()) @@ -974,7 +975,7 @@ static Error handleArgs(const CommonConfig &Config, const ELFConfig &ELFConfig, Sec.Name = std::string(SR.NewName); if (SR.NewFlags) { if (Error E = setSectionFlagsAndType(Sec, *SR.NewFlags, Obj.Machine)) - return E; + return createFileError(Config.InputFilename, std::move(E)); } RenamedSections.insert(&Sec); } else if (RelocSec && !(Sec.Flags & SHF_ALLOC)) @@ -1091,7 +1092,7 @@ Error objcopy::elf::executeObjcopyOnBinary(const CommonConfig &Config, : getOutputElfType(In); if (Error E = handleArgs(Config, 
ELFConfig, OutputElfType, **Obj)) - return createFileError(Config.InputFilename, std::move(E)); + return E; if (Error E = writeOutput(Config, **Obj, Out, OutputElfType)) return createFileError(Config.InputFilename, std::move(E)); diff --git a/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp b/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp index a188425b283fa..682edffc84f34 100644 --- a/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp +++ b/llvm/lib/ObjCopy/MachO/MachOObjcopy.cpp @@ -306,25 +306,25 @@ static Error processLoadCommands(const MachOConfig &MachOConfig, Object &Obj) { } static Error dumpSectionToFile(StringRef SecName, StringRef Filename, - Object &Obj) { + StringRef InputFilename, Object &Obj) { for (LoadCommand &LC : Obj.LoadCommands) for (const std::unique_ptr
&Sec : LC.Sections) { if (Sec->CanonicalName == SecName) { Expected> BufferOrErr = FileOutputBuffer::create(Filename, Sec->Content.size()); if (!BufferOrErr) - return BufferOrErr.takeError(); + return createFileError(Filename, BufferOrErr.takeError()); std::unique_ptr Buf = std::move(*BufferOrErr); llvm::copy(Sec->Content, Buf->getBufferStart()); if (Error E = Buf->commit()) - return E; + return createFileError(Filename, std::move(E)); return Error::success(); } } - return createStringError(object_error::parse_failed, "section '%s' not found", - SecName.str().c_str()); + return createFileError(InputFilename, object_error::parse_failed, + "section '%s' not found", SecName.str().c_str()); } static Error addSection(const NewSectionInfo &NewSection, Object &Obj) { @@ -426,12 +426,13 @@ static Error handleArgs(const CommonConfig &Config, StringRef SectionName; StringRef FileName; std::tie(SectionName, FileName) = Flag.split('='); - if (Error E = dumpSectionToFile(SectionName, FileName, Obj)) + if (Error E = + dumpSectionToFile(SectionName, FileName, Config.InputFilename, Obj)) return E; } if (Error E = removeSections(Config, Obj)) - return E; + return createFileError(Config.InputFilename, std::move(E)); // Mark symbols to determine which symbols are still needed. 
if (Config.StripAll) @@ -446,20 +447,20 @@ static Error handleArgs(const CommonConfig &Config, for (const NewSectionInfo &NewSection : Config.AddSection) { if (Error E = isValidMachOCannonicalName(NewSection.SectionName)) - return E; + return createFileError(Config.InputFilename, std::move(E)); if (Error E = addSection(NewSection, Obj)) - return E; + return createFileError(Config.InputFilename, std::move(E)); } for (const NewSectionInfo &NewSection : Config.UpdateSection) { if (Error E = isValidMachOCannonicalName(NewSection.SectionName)) - return E; + return createFileError(Config.InputFilename, std::move(E)); if (Error E = updateSection(NewSection, Obj)) - return E; + return createFileError(Config.InputFilename, std::move(E)); } if (Error E = processLoadCommands(MachOConfig, Obj)) - return E; + return createFileError(Config.InputFilename, std::move(E)); return Error::success(); } @@ -479,7 +480,7 @@ Error objcopy::macho::executeObjcopyOnBinary(const CommonConfig &Config, Config.InputFilename.str().c_str()); if (Error E = handleArgs(Config, MachOConfig, **O)) - return createFileError(Config.InputFilename, std::move(E)); + return E; // Page size used for alignment of segment sizes in Mach-O executables and // dynamic libraries. 
diff --git a/llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp b/llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp index cf3d884bee3bd..57fd0f5ad233c 100644 --- a/llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp +++ b/llvm/lib/ObjCopy/wasm/WasmObjcopy.cpp @@ -38,23 +38,23 @@ static bool isCommentSection(const Section &Sec) { } static Error dumpSectionToFile(StringRef SecName, StringRef Filename, - Object &Obj) { + StringRef InputFilename, Object &Obj) { for (const Section &Sec : Obj.Sections) { if (Sec.Name == SecName) { ArrayRef Contents = Sec.Contents; Expected> BufferOrErr = FileOutputBuffer::create(Filename, Contents.size()); if (!BufferOrErr) - return BufferOrErr.takeError(); + return createFileError(Filename, BufferOrErr.takeError()); std::unique_ptr Buf = std::move(*BufferOrErr); std::copy(Contents.begin(), Contents.end(), Buf->getBufferStart()); if (Error E = Buf->commit()) - return E; + return createFileError(Filename, std::move(E)); return Error::success(); } } - return createStringError(errc::invalid_argument, "section '%s' not found", - SecName.str().c_str()); + return createFileError(Filename, errc::invalid_argument, + "section '%s' not found", SecName.str().c_str()); } static void removeSections(const CommonConfig &Config, Object &Obj) { @@ -115,8 +115,9 @@ static Error handleArgs(const CommonConfig &Config, Object &Obj) { StringRef SecName; StringRef FileName; std::tie(SecName, FileName) = Flag.split("="); - if (Error E = dumpSectionToFile(SecName, FileName, Obj)) - return createFileError(FileName, std::move(E)); + if (Error E = + dumpSectionToFile(SecName, FileName, Config.InputFilename, Obj)) + return E; } removeSections(Config, Obj); diff --git a/llvm/test/tools/llvm-objcopy/ELF/dump-section.test b/llvm/test/tools/llvm-objcopy/ELF/dump-section.test index 037ec86090e55..2dbbcc0ca568e 100644 --- a/llvm/test/tools/llvm-objcopy/ELF/dump-section.test +++ b/llvm/test/tools/llvm-objcopy/ELF/dump-section.test @@ -64,3 +64,7 @@ ProgramHeaders: # RUN: not llvm-objcopy --dump-section .text= 
%t /dev/null 2>&1 | FileCheck %s --check-prefix=ERR2 # ERR2: error: bad format for --dump-section, expected section=file + +# RUN: not llvm-objcopy --dump-section .text=not_exists/text-section %t 2>&1 \ +# RUN: | FileCheck -DMSG=%errc_ENOENT %s -DINPUT=%t --check-prefix=NO-SUCH-PATH +# NO-SUCH-PATH: error: 'not_exists/text-section': [[MSG]] diff --git a/llvm/test/tools/llvm-objcopy/MachO/dump-section.test b/llvm/test/tools/llvm-objcopy/MachO/dump-section.test index 9a1227cdbbda1..d54a50b557bb7 100644 --- a/llvm/test/tools/llvm-objcopy/MachO/dump-section.test +++ b/llvm/test/tools/llvm-objcopy/MachO/dump-section.test @@ -21,6 +21,10 @@ # RUN: | FileCheck %s -DINPUT=%t --check-prefix=NO-SUCH-SECTION # NO-SUCH-SECTION: error: '[[INPUT]]': section '__TEXT,__foo' not found +# RUN: not llvm-objcopy --dump-section __TEXT,__text=not_exists/text-section %t 2>&1 \ +# RUN: | FileCheck -DMSG=%errc_ENOENT %s -DINPUT=%t --check-prefix=NO-SUCH-PATH +# NO-SUCH-PATH: error: 'not_exists/text-section': [[MSG]] + --- !mach-o FileHeader: magic: 0xFEEDFACF diff --git a/llvm/test/tools/llvm-objcopy/wasm/dump-section.test b/llvm/test/tools/llvm-objcopy/wasm/dump-section.test index 983a581e03fe2..2d1533f06df10 100644 --- a/llvm/test/tools/llvm-objcopy/wasm/dump-section.test +++ b/llvm/test/tools/llvm-objcopy/wasm/dump-section.test @@ -28,6 +28,10 @@ # REMOVED-NOT: producers +# RUN: not llvm-objcopy --dump-section producers=not_exists/text-section %t 2>&1 \ +# RUN: | FileCheck -DMSG=%errc_ENOENT %s -DINPUT=%t --check-prefix=NO-SUCH-PATH +# NO-SUCH-PATH: error: 'not_exists/text-section': [[MSG]] + --- !WASM FileHeader: Version: 0x00000001 From ef23ba7da34ca1285f10603cc4aa6441ab4530e6 Mon Sep 17 00:00:00 2001 From: Guy David <49722543+guy-david@users.noreply.github.com> Date: Sat, 8 Feb 2025 15:16:26 +0200 Subject: [PATCH 054/293] [Support] Re-raise external signals (#125854) Otherwise, the handler "swallows" the signal and the process continues to execute. 
While this use case is peculiar, ignoring these signals entirely seems more odd. --- llvm/lib/Support/Unix/Signals.inc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Support/Unix/Signals.inc b/llvm/lib/Support/Unix/Signals.inc index 2e7b467a14bbe..30e5f40193974 100644 --- a/llvm/lib/Support/Unix/Signals.inc +++ b/llvm/lib/Support/Unix/Signals.inc @@ -80,7 +80,7 @@ using namespace llvm; -static void SignalHandler(int Sig); // defined below. +static void SignalHandler(int Sig, siginfo_t *Info, void *); static void InfoSignalHandler(int Sig); // defined below. using SignalHandlerFunctionType = void (*)(); @@ -313,8 +313,8 @@ static void RegisterHandlers() { // Not signal-safe. switch (Kind) { case SignalKind::IsKill: - NewHandler.sa_handler = SignalHandler; - NewHandler.sa_flags = SA_NODEFER | SA_RESETHAND | SA_ONSTACK; + NewHandler.sa_sigaction = SignalHandler; + NewHandler.sa_flags = SA_NODEFER | SA_RESETHAND | SA_ONSTACK | SA_SIGINFO; break; case SignalKind::IsInfo: NewHandler.sa_handler = InfoSignalHandler; @@ -370,7 +370,7 @@ void sys::CleanupOnSignal(uintptr_t Context) { } // The signal handler that runs. -static void SignalHandler(int Sig) { +static void SignalHandler(int Sig, siginfo_t *Info, void *) { // Restore the signal behavior to default, so that the program actually // crashes when we return and the signal reissues. This also ensures that if // we crash in our signal handler that the program will terminate immediately @@ -412,6 +412,11 @@ static void SignalHandler(int Sig) { if (Sig == SIGILL || Sig == SIGFPE || Sig == SIGTRAP) raise(Sig); #endif + + // Signal sent from another process, do not assume that continuing the + // execution would re-raise it. 
+ if (Info->si_pid != getpid()) + raise(Sig); } static void InfoSignalHandler(int Sig) { From 4e29148cca3fac0f1ffb1fbfbe3bbbd489859897 Mon Sep 17 00:00:00 2001 From: Mats Jun Larsen Date: Sat, 8 Feb 2025 22:22:42 +0900 Subject: [PATCH 055/293] [CodeGen][XCore] Replace PointerType::getUnqual(Type) with opaque version (NFC) (#126279) Follow-up to #123569 --- clang/lib/CodeGen/Targets/XCore.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/Targets/XCore.cpp b/clang/lib/CodeGen/Targets/XCore.cpp index ced4981fd124f..b7824bde5f55a 100644 --- a/clang/lib/CodeGen/Targets/XCore.cpp +++ b/clang/lib/CodeGen/Targets/XCore.cpp @@ -149,7 +149,7 @@ RValue XCoreABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, llvm::Type *ArgTy = CGT.ConvertType(Ty); if (AI.canHaveCoerceToType() && !AI.getCoerceToType()) AI.setCoerceToType(ArgTy); - llvm::Type *ArgPtrTy = llvm::PointerType::getUnqual(ArgTy); + llvm::Type *ArgPtrTy = llvm::PointerType::getUnqual(ArgTy->getContext()); Address Val = Address::invalid(); CharUnits ArgSize = CharUnits::Zero(); From 54e0c2bbe2b36b08772ca6e5e3f176d7caf116bd Mon Sep 17 00:00:00 2001 From: Mats Jun Larsen Date: Sat, 8 Feb 2025 22:22:53 +0900 Subject: [PATCH 056/293] [CodeGen][SystemZ] Replace PointerType::getUnqual(Type) with opaque version (NFC) (#126280) Follow-up to #126278 --- clang/lib/CodeGen/Targets/SystemZ.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/Targets/SystemZ.cpp b/clang/lib/CodeGen/Targets/SystemZ.cpp index 23c96fa5cf98c..9bb8ddbc548d2 100644 --- a/clang/lib/CodeGen/Targets/SystemZ.cpp +++ b/clang/lib/CodeGen/Targets/SystemZ.cpp @@ -272,7 +272,7 @@ RValue SystemZABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, SZCGI.handleExternallyVisibleObjABI(Ty.getTypePtr(), CGT.getCGM(), /*IsParam*/true); if (IsIndirect) { - DirectTy = llvm::PointerType::getUnqual(DirectTy); + DirectTy = llvm::PointerType::getUnqual(DirectTy->getContext()); 
UnpaddedSize = DirectAlign = CharUnits::fromQuantity(8); } else { if (AI.getCoerceToType()) From df2e8ee7ae349364967a1a2d09f17b249a38c04d Mon Sep 17 00:00:00 2001 From: Mats Jun Larsen Date: Sat, 8 Feb 2025 22:23:08 +0900 Subject: [PATCH 057/293] [CodeGen][AArch64] Replace PointerType::getUnqual(Type) with opaque version (NFC) (#126278) Follow-up to #123569 --- clang/lib/CodeGen/Targets/AArch64.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp index 4922b082cf09c..dc3a1d4287be1 100644 --- a/clang/lib/CodeGen/Targets/AArch64.cpp +++ b/clang/lib/CodeGen/Targets/AArch64.cpp @@ -843,7 +843,7 @@ RValue AArch64ABIInfo::EmitAAPCSVAArg(Address VAListAddr, QualType Ty, llvm::Type *BaseTy = CGF.ConvertType(Ty); if (IsIndirect) - BaseTy = llvm::PointerType::getUnqual(BaseTy); + BaseTy = llvm::PointerType::getUnqual(BaseTy->getContext()); else if (AI.getCoerceToType()) BaseTy = AI.getCoerceToType(); @@ -961,7 +961,7 @@ RValue AArch64ABIInfo::EmitAAPCSVAArg(Address VAListAddr, QualType Ty, if (IsIndirect) { // If it's been passed indirectly (actually a struct), whatever we find from // stored registers or on the stack will actually be a struct **. - MemTy = llvm::PointerType::getUnqual(MemTy); + MemTy = llvm::PointerType::getUnqual(MemTy->getContext()); } const Type *Base = nullptr; From 6ff8a06de9ce125023e117014ce4dca8fcc391d7 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 8 Feb 2025 13:33:46 +0000 Subject: [PATCH 058/293] [VPlan] Run recipe removal and simplification after optimizeForVFAndUF. (#125926) Run recipe simplification and dead recipe removal after VPlan-based unrolling and optimizeForVFAndUF, to clean up any redundant or dead recipes introduced by them. Currently this is NFC, as it removes the corresponding removeDeadRecipes run in optimizeForVFAndUF and no additional simplifications kick in after unrolling yet. 
That is changing with https://github.com/llvm/llvm-project/pull/123655. Note that with this change, pattern-matching is now applied after EVL-based recipes have been introduced. Trying to match VPWidenEVLRecipe when not explicitly requested might apply a pattern with 2 operands to one with 3 due to the extra EVL operand and VPWidenEVLRecipe being a subclass of VPWidenRecipe. To prevent this, update Recipe_match::match to only match VPWidenEVLRecipe if it is in the requested recipe types (RecipeTy). PR: https://github.com/llvm/llvm-project/pull/125926 --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 ++ llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h | 8 ++++++++ llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 5 +---- llvm/lib/Transforms/Vectorize/VPlanTransforms.h | 4 ++++ 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 4a03d6e9a3ced..dacee6445072a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7672,6 +7672,8 @@ DenseMap LoopVectorizationPlanner::executePlan( VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF, OrigLoop->getHeader()->getContext()); VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); + VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType()); + VPlanTransforms::removeDeadRecipes(BestVPlan); VPlanTransforms::convertToConcreteRecipes(BestVPlan); // Perform the actual loop transformation. diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 8c11d93734667..ebc82c01467cf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -155,6 +155,14 @@ struct Recipe_match { if ((!matchRecipeAndOpcode(R) && ...)) return false; + if (!(std::is_same_v || ...) 
&& + isa(R)) { + // Don't match VPWidenEVLRecipe if it is not explicitly part of RecipeTys. + // Otherwise we might match it unexpectedly when trying to match + // VPWidenRecipe, of which VPWidenEVLRecipe is a subclass. + return false; + } + assert(R->getNumOperands() == std::tuple_size::value && "recipe with matched opcode the expected number of operands"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 1380946fd6b4d..6c917e4eef655 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -964,9 +964,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return R.getVPSingleValue()->replaceAllUsesWith(R.getOperand(1)); } -/// Try to simplify the recipes in \p Plan. Use \p CanonicalIVTy as type for all -/// un-typed live-ins in VPTypeAnalysis. -static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) { +void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) { ReversePostOrderTraversal> RPOT( Plan.getEntry()); VPTypeAnalysis TypeInfo(&CanonicalIVTy); @@ -1043,7 +1041,6 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, } Term->eraseFromParent(); - VPlanTransforms::removeDeadRecipes(Plan); Plan.setVF(BestVF); Plan.setUF(BestUF); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 0cd4cf1f22a7d..3dd476a8526d6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -163,6 +163,10 @@ struct VPlanTransforms { /// Lower abstract recipes to concrete ones, that can be codegen'd. static void convertToConcreteRecipes(VPlan &Plan); + /// Perform instcombine-like simplifications on recipes in \p Plan. Use \p + /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
+ static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy); + /// If there's a single exit block, optimize its phi recipes that use exiting /// IV values by feeding them precomputed end values instead, possibly taken /// one step backwards. From e0fee55a5549e04bb14d45fba6267bd69285ce77 Mon Sep 17 00:00:00 2001 From: Mats Jun Larsen Date: Sun, 9 Feb 2025 00:13:02 +0900 Subject: [PATCH 059/293] [CodeGen] Replace of PointerType::get(Type) with opaque version (NFC) (#124771) Follow-up to https://github.com/llvm/llvm-project/issues/123569 --- clang/lib/CodeGen/Address.h | 5 +---- clang/lib/CodeGen/CGBlocks.cpp | 23 +---------------------- clang/lib/CodeGen/CGDecl.cpp | 8 ++------ clang/lib/CodeGen/CGDeclCXX.cpp | 10 ++-------- clang/lib/CodeGen/CGExpr.cpp | 4 ++-- clang/lib/CodeGen/CGObjCMac.cpp | 2 +- clang/lib/CodeGen/CodeGenModule.cpp | 6 +++--- 7 files changed, 12 insertions(+), 46 deletions(-) diff --git a/clang/lib/CodeGen/Address.h b/clang/lib/CodeGen/Address.h index a18c7169af1eb..a748ddaa110a5 100644 --- a/clang/lib/CodeGen/Address.h +++ b/clang/lib/CodeGen/Address.h @@ -197,10 +197,7 @@ class Address { /// Return the type of the pointer value. llvm::PointerType *getType() const { - return llvm::PointerType::get( - ElementType, - llvm::cast(Pointer.getPointer()->getType()) - ->getAddressSpace()); + return llvm::cast(Pointer.getPointer()->getType()); } /// Return the type of the values stored in this address. 
diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp index aaba354c08547..faef6a5fbe1f5 100644 --- a/clang/lib/CodeGen/CGBlocks.cpp +++ b/clang/lib/CodeGen/CGBlocks.cpp @@ -1097,31 +1097,10 @@ llvm::Type *CodeGenModule::getBlockDescriptorType() { if (BlockDescriptorType) return BlockDescriptorType; - llvm::Type *UnsignedLongTy = - getTypes().ConvertType(getContext().UnsignedLongTy); - - // struct __block_descriptor { - // unsigned long reserved; - // unsigned long block_size; - // - // // later, the following will be added - // - // struct { - // void (*copyHelper)(); - // void (*copyHelper)(); - // } helpers; // !!! optional - // - // const char *signature; // the block signature - // const char *layout; // reserved - // }; - BlockDescriptorType = llvm::StructType::create( - "struct.__block_descriptor", UnsignedLongTy, UnsignedLongTy); - - // Now form a pointer to that. unsigned AddrSpace = 0; if (getLangOpts().OpenCL) AddrSpace = getContext().getTargetAddressSpace(LangAS::opencl_constant); - BlockDescriptorType = llvm::PointerType::get(BlockDescriptorType, AddrSpace); + BlockDescriptorType = llvm::PointerType::get(getLLVMContext(), AddrSpace); return BlockDescriptorType; } diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index cc6815db4d20f..668282a6ab1a8 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -2870,15 +2870,12 @@ void CodeGenModule::EmitOMPAllocateDecl(const OMPAllocateDecl *D) { // We can also keep the existing global if the address space is what we // expect it to be, if not, it is replaced. - QualType ASTTy = VD->getType(); clang::LangAS GVAS = GetGlobalVarAddressSpace(VD); auto TargetAS = getContext().getTargetAddressSpace(GVAS); if (Entry->getType()->getAddressSpace() == TargetAS) continue; - // Make a new global with the correct type / address space. 
- llvm::Type *Ty = getTypes().ConvertTypeForMem(ASTTy); - llvm::PointerType *PTy = llvm::PointerType::get(Ty, TargetAS); + llvm::PointerType *PTy = llvm::PointerType::get(getLLVMContext(), TargetAS); // Replace all uses of the old global with a cast. Since we mutate the type // in place we neeed an intermediate that takes the spot of the old entry @@ -2891,8 +2888,7 @@ void CodeGenModule::EmitOMPAllocateDecl(const OMPAllocateDecl *D) { Entry->mutateType(PTy); llvm::Constant *NewPtrForOldDecl = - llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( - Entry, DummyGV->getType()); + llvm::ConstantExpr::getAddrSpaceCast(Entry, DummyGV->getType()); // Now we have a casted version of the changed global, the dummy can be // replaced and deleted. diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index 1c2fecea1a6ac..f5950f03673a1 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -345,10 +345,7 @@ void CodeGenFunction::registerGlobalDtorWithLLVM(const VarDecl &VD, void CodeGenFunction::registerGlobalDtorWithAtExit(llvm::Constant *dtorStub) { // extern "C" int atexit(void (*f)(void)); - assert(dtorStub->getType() == - llvm::PointerType::get( - llvm::FunctionType::get(CGM.VoidTy, false), - dtorStub->getType()->getPointerAddressSpace()) && + assert(dtorStub->getType()->isPointerTy() && "Argument to atexit has a wrong type."); llvm::FunctionType *atexitTy = @@ -372,10 +369,7 @@ CodeGenFunction::unregisterGlobalDtorWithUnAtExit(llvm::Constant *dtorStub) { // value is returned. 
// // extern "C" int unatexit(void (*f)(void)); - assert(dtorStub->getType() == - llvm::PointerType::get( - llvm::FunctionType::get(CGM.VoidTy, false), - dtorStub->getType()->getPointerAddressSpace()) && + assert(dtorStub->getType()->isPointerTy() && "Argument to unatexit has a wrong type."); llvm::FunctionType *unatexitTy = diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 1e233c42c8782..2bbc0791c6587 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -872,7 +872,7 @@ void CodeGenFunction::EmitTypeCheck(TypeCheckKind TCK, SourceLocation Loc, llvm::Value *TypeHash = llvm::ConstantInt::get(Int64Ty, xxh3_64bits(Out.str())); - llvm::Type *VPtrTy = llvm::PointerType::get(IntPtrTy, 0); + llvm::Type *VPtrTy = llvm::PointerType::get(getLLVMContext(), 0); Address VPtrAddr(Ptr, IntPtrTy, getPointerAlign()); llvm::Value *VPtrVal = GetVTablePtr(VPtrAddr, VPtrTy, Ty->getAsCXXRecordDecl(), @@ -3054,7 +3054,7 @@ LValue CodeGenFunction::EmitDeclRefLValue(const DeclRefExpr *E) { getContext().getDeclAlign(VD)); llvm::Type *VarTy = getTypes().ConvertTypeForMem(VD->getType()); auto *PTy = llvm::PointerType::get( - VarTy, getTypes().getTargetAddressSpace(VD->getType())); + getLLVMContext(), getTypes().getTargetAddressSpace(VD->getType())); Addr = Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, PTy, VarTy); } else { // Should we be using the alignment of the constant pointer we emitted? 
diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp index dd900f9b32fb7..6c929a6431c0f 100644 --- a/clang/lib/CodeGen/CGObjCMac.cpp +++ b/clang/lib/CodeGen/CGObjCMac.cpp @@ -5717,7 +5717,7 @@ ObjCCommonTypesHelper::ObjCCommonTypesHelper(CodeGen::CodeGenModule &cgm) IntTy = CGM.IntTy; LongTy = cast(Types.ConvertType(Ctx.LongTy)); Int8PtrTy = CGM.Int8PtrTy; - Int8PtrProgramASTy = llvm::PointerType::get(CGM.Int8Ty, ProgramAS); + Int8PtrProgramASTy = llvm::PointerType::get(CGM.getLLVMContext(), ProgramAS); Int8PtrPtrTy = CGM.Int8PtrPtrTy; // arm64 targets use "int" ivar offset variables. All others, diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 47c03ea5e72cb..c056d103a7fe4 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -4432,7 +4432,7 @@ void CodeGenModule::emitCPUDispatchDefinition(GlobalDecl GD) { GlobalDecl ResolverGD; if (getTarget().supportsIFunc()) { ResolverType = llvm::FunctionType::get( - llvm::PointerType::get(DeclTy, + llvm::PointerType::get(getLLVMContext(), getTypes().getTargetAddressSpace(FD->getType())), false); } @@ -4604,8 +4604,8 @@ llvm::Constant *CodeGenModule::GetOrCreateMultiVersionResolver(GlobalDecl GD) { // cpu_dispatch will be emitted in this translation unit. 
if (ShouldReturnIFunc) { unsigned AS = getTypes().getTargetAddressSpace(FD->getType()); - llvm::Type *ResolverType = - llvm::FunctionType::get(llvm::PointerType::get(DeclTy, AS), false); + llvm::Type *ResolverType = llvm::FunctionType::get( + llvm::PointerType::get(getLLVMContext(), AS), false); llvm::Constant *Resolver = GetOrCreateLLVMFunction( MangledName + ".resolver", ResolverType, GlobalDecl{}, /*ForVTable=*/false); From a07928c3ce9da62b82a796ef26f5f7aaa0311d37 Mon Sep 17 00:00:00 2001 From: Mats Jun Larsen Date: Sun, 9 Feb 2025 00:13:23 +0900 Subject: [PATCH 060/293] [CodeGen][Hexagon] Replace PointerType::getUnqual(Type) with opaque version (NFC) (#126274) Follow-up to https://github.com/llvm/llvm-project/issues/123569 The obsolete bitcasts on the LoadInsts are also removed. --- clang/lib/CodeGen/Targets/Hexagon.cpp | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/clang/lib/CodeGen/Targets/Hexagon.cpp b/clang/lib/CodeGen/Targets/Hexagon.cpp index 8fd2a81494d99..aada8d0d61303 100644 --- a/clang/lib/CodeGen/Targets/Hexagon.cpp +++ b/clang/lib/CodeGen/Targets/Hexagon.cpp @@ -336,10 +336,6 @@ Address HexagonABIInfo::EmitVAArgForHexagonLinux(CodeGenFunction &CGF, // Implement the block where argument is in register saved area CGF.EmitBlock(InRegBlock); - llvm::Type *PTy = CGF.ConvertType(Ty); - llvm::Value *__saved_reg_area_p = CGF.Builder.CreateBitCast( - __current_saved_reg_area_pointer, llvm::PointerType::getUnqual(PTy)); - CGF.Builder.CreateStore(__new_saved_reg_area_pointer, __current_saved_reg_area_pointer_p); @@ -388,22 +384,16 @@ Address HexagonABIInfo::EmitVAArgForHexagonLinux(CodeGenFunction &CGF, CGF.Builder.CreateStore(__new_overflow_area_pointer, __current_saved_reg_area_pointer_p); - // Bitcast the overflow area pointer to the type of argument. 
- llvm::Type *OverflowPTy = CGF.ConvertTypeForMem(Ty); - llvm::Value *__overflow_area_p = CGF.Builder.CreateBitCast( - __overflow_area_pointer, llvm::PointerType::getUnqual(OverflowPTy)); - CGF.EmitBranch(ContBlock); - // Get the correct pointer to load the variable argument // Implement the ContBlock CGF.EmitBlock(ContBlock); llvm::Type *MemTy = CGF.ConvertTypeForMem(Ty); - llvm::Type *MemPTy = llvm::PointerType::getUnqual(MemTy); - llvm::PHINode *ArgAddr = CGF.Builder.CreatePHI(MemPTy, 2, "vaarg.addr"); - ArgAddr->addIncoming(__saved_reg_area_p, InRegBlock); - ArgAddr->addIncoming(__overflow_area_p, OnStackBlock); + llvm::PHINode *ArgAddr = CGF.Builder.CreatePHI( + llvm::PointerType::getUnqual(MemTy->getContext()), 2, "vaarg.addr"); + ArgAddr->addIncoming(__current_saved_reg_area_pointer, InRegBlock); + ArgAddr->addIncoming(__overflow_area_pointer, OnStackBlock); return Address(ArgAddr, MemTy, CharUnits::fromQuantity(ArgAlign)); } From 101b3ff7af8fabe4ec5c06219a70094c1d901c49 Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Sat, 8 Feb 2025 07:59:55 -0800 Subject: [PATCH 061/293] [RISCV][NFC] Adopt DiagnosticString interface (#126290) --- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 63 ++----------------- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 6 ++ llvm/lib/Target/RISCV/RISCVInstrInfoV.td | 2 + llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td | 1 + llvm/lib/Target/RISCV/RISCVInstrInfoZc.td | 3 + llvm/lib/Target/RISCV/RISCVInstrInfoZfa.td | 2 + 6 files changed, 19 insertions(+), 58 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index ea6ca3b8f9a2d..ac87d72b7595c 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -280,7 +280,7 @@ class RISCVAsmParser : public MCTargetAsmParser { std::unique_ptr defaultFRMArgLegacyOp() const; public: - enum RISCVMatchResultTy { + enum RISCVMatchResultTy : unsigned { Match_Dummy = 
FIRST_TARGET_MATCH_RESULT_TY, #define GET_OPERAND_DIAGNOSTIC_TYPES #include "RISCVGenAsmMatcher.inc" @@ -1527,10 +1527,6 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, std::numeric_limits::max(), "operand either must be a bare symbol name or an immediate integer in " "the range"); - case Match_InvalidImmZero: { - SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); - return Error(ErrorLoc, "immediate must be zero"); - } case Match_InvalidUImmLog2XLen: if (isRV64()) return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 6) - 1); @@ -1657,47 +1653,10 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, "operand must be a valid system register " "name or an integer in the range"); } - case Match_InvalidLoadFPImm: { - SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); - return Error(ErrorLoc, "operand must be a valid floating-point constant"); - } - case Match_InvalidBareSymbol: { - SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); - return Error(ErrorLoc, "operand must be a bare symbol name"); - } - case Match_InvalidPseudoJumpSymbol: { - SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); - return Error(ErrorLoc, "operand must be a valid jump target"); - } - case Match_InvalidCallSymbol: { - SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); - return Error(ErrorLoc, "operand must be a bare symbol name"); - } - case Match_InvalidTPRelAddSymbol: { - SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); - return Error(ErrorLoc, "operand must be a symbol with %tprel_add modifier"); - } - case Match_InvalidTLSDESCCallSymbol: { - SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); - return Error(ErrorLoc, - "operand must be a symbol with %tlsdesc_call modifier"); - } - case Match_InvalidRTZArg: { - SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); - 
return Error(ErrorLoc, "operand must be 'rtz' floating-point rounding mode"); - } case Match_InvalidVTypeI: { SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); return generateVTypeError(ErrorLoc); } - case Match_InvalidVMaskRegister: { - SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); - return Error(ErrorLoc, "operand must be v0.t"); - } - case Match_InvalidVMaskCarryInRegister: { - SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); - return Error(ErrorLoc, "operand must be v0"); - } case Match_InvalidSImm5Plus1: { return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 4) + 1, (1 << 4), @@ -1706,26 +1665,14 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidSImm26: return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 25), (1 << 25) - 1); - case Match_InvalidRlist: { - SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); - return Error( - ErrorLoc, - "operand must be {ra [, s0[-sN]]} or {x1 [, x8[-x9][, x18[-xN]]]}"); - } - case Match_InvalidStackAdj: { - SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); - return Error( - ErrorLoc, - "stack adjustment is invalid for this instruction and register list; " - "refer to Zc spec for a detailed range of stack adjustment"); - } case Match_InvalidRnumArg: { return generateImmOutOfRangeError(Operands, ErrorInfo, 0, 10); } - case Match_InvalidRegReg: { - SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); - return Error(ErrorLoc, "operands must be register and register"); } + + if (const char *MatchDiag = getMatchKindDiag((RISCVMatchResultTy)Result)) { + SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); + return Error(ErrorLoc, MatchDiag); } llvm_unreachable("Unknown match type detected!"); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 54fee1ac3130e..fde7dc89dd693 100644 
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -125,6 +125,7 @@ def ImmZeroAsmOperand : AsmOperandClass { let Name = "ImmZero"; let RenderMethod = "addImmOperands"; let DiagnosticType = !strconcat("Invalid", Name); + let DiagnosticString = "immediate must be zero"; } // A parse method for (${gpr}) or 0(${gpr}), where the 0 is be silently ignored. @@ -323,6 +324,7 @@ def BareSymbol : AsmOperandClass { let Name = "BareSymbol"; let RenderMethod = "addImmOperands"; let DiagnosticType = "InvalidBareSymbol"; + let DiagnosticString = "operand must be a bare symbol name"; let ParserMethod = "parseBareSymbol"; } @@ -335,6 +337,7 @@ def CallSymbol : AsmOperandClass { let Name = "CallSymbol"; let RenderMethod = "addImmOperands"; let DiagnosticType = "InvalidCallSymbol"; + let DiagnosticString = "operand must be a bare symbol name"; let ParserMethod = "parseCallSymbol"; } @@ -347,6 +350,7 @@ def PseudoJumpSymbol : AsmOperandClass { let Name = "PseudoJumpSymbol"; let RenderMethod = "addImmOperands"; let DiagnosticType = "InvalidPseudoJumpSymbol"; + let DiagnosticString = "operand must be a valid jump target"; let ParserMethod = "parsePseudoJumpSymbol"; } @@ -359,6 +363,7 @@ def TPRelAddSymbol : AsmOperandClass { let Name = "TPRelAddSymbol"; let RenderMethod = "addImmOperands"; let DiagnosticType = "InvalidTPRelAddSymbol"; + let DiagnosticString = "operand must be a symbol with %tprel_add modifier"; let ParserMethod = "parseOperandWithModifier"; } @@ -1779,6 +1784,7 @@ def TLSDESCCallSymbol : AsmOperandClass { let Name = "TLSDESCCallSymbol"; let RenderMethod = "addImmOperands"; let DiagnosticType = "InvalidTLSDESCCallSymbol"; + let DiagnosticString = "operand must be a symbol with %tlsdesc_call modifier"; let ParserMethod = "parseOperandWithModifier"; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td index 671e493fb3763..fdb2334b131da 100644 --- 
a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td @@ -48,6 +48,7 @@ def VMaskAsmOperand : AsmOperandClass { let IsOptional = 1; let DefaultMethod = "defaultMaskRegOp"; let DiagnosticType = "InvalidVMaskRegister"; + let DiagnosticString = "operand must be v0.t"; } def VMaskCarryInAsmOperand : AsmOperandClass { @@ -55,6 +56,7 @@ def VMaskCarryInAsmOperand : AsmOperandClass { let RenderMethod = "addRegOperands"; let PredicateMethod = "isV0Reg"; let DiagnosticType = "InvalidVMaskCarryInRegister"; + let DiagnosticString = "operand must be v0"; } def VMaskOp : RegisterOperand { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td index b98934d8c6396..bade4863ad348 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td @@ -514,6 +514,7 @@ def CVrrAsmOperand : AsmOperandClass { let Name = "RegReg"; let ParserMethod = "parseRegReg"; let DiagnosticType = "InvalidRegReg"; + let DiagnosticString = "operands must be register and register"; } def CVrr : Operand, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td index 5cc16765d4ae2..9dfbcf678d6eb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td @@ -39,12 +39,15 @@ def RlistAsmOperand : AsmOperandClass { let Name = "Rlist"; let ParserMethod = "parseReglist"; let DiagnosticType = "InvalidRlist"; + let DiagnosticString = "operand must be {ra [, s0[-sN]]} or {x1 [, x8[-x9][, x18[-xN]]]}"; } def StackAdjAsmOperand : AsmOperandClass { let Name = "StackAdj"; let ParserMethod = "parseZcmpStackAdj"; let DiagnosticType = "InvalidStackAdj"; + let DiagnosticString = "stack adjustment is invalid for this instruction and register list; " + "refer to Zc spec for a detailed range of stack adjustment"; let PredicateMethod = "isSpimm"; let RenderMethod = "addSpimmOperands"; } diff --git 
a/llvm/lib/Target/RISCV/RISCVInstrInfoZfa.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfa.td index ab54b45f4de93..a539ca82b7462 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfa.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfa.td @@ -30,6 +30,7 @@ def LoadFPImmOperand : AsmOperandClass { let ParserMethod = "parseFPImm"; let RenderMethod = "addFPImmOperands"; let DiagnosticType = "InvalidLoadFPImm"; + let DiagnosticString = "operand must be a valid floating-point constant"; } def loadfpimm : Operand { @@ -43,6 +44,7 @@ def RTZArg : AsmOperandClass { let Name = "RTZArg"; let RenderMethod = "addFRMArgOperands"; let DiagnosticType = "InvalidRTZArg"; + let DiagnosticString = "operand must be 'rtz' floating-point rounding mode"; let ParserMethod = "parseFRMArg"; } From 2feced1df0aa01f78501720b98faa985bcec846a Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Sat, 8 Feb 2025 21:34:40 +0530 Subject: [PATCH 062/293] [MLIR][NVVM] Add tcgen05 wait/fence Ops (#126265) PR #126091 adds intrinsics for tcgen05 wait/fence/commit operations. This patch adds NVVM Dialect Ops for them. 
Signed-off-by: Durgadoss R --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 109 ++++++++++++++++++ mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 30 +++++ .../Target/LLVMIR/nvvm/tcgen05-barriers.mlir | 56 +++++++++ 3 files changed, 195 insertions(+) create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-barriers.mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 11226dae2c3f3..fe15a524ec3b5 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -2617,6 +2617,30 @@ def Tcgen05GroupKindAttr : let assemblyFormat = "`<` $value `>`"; } +def Tcgen05FenceBefore : I32EnumAttrCase<"BEFORE_THREAD_SYNC", 0, "before">; +def Tcgen05FenceAfter : I32EnumAttrCase<"AFTER_THREAD_SYNC", 1, "after">; +def Tcgen05FenceKind : I32EnumAttr<"Tcgen05FenceKind", "NVVM Tcgen05 fence kind", + [Tcgen05FenceBefore, Tcgen05FenceAfter]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::NVVM"; +} +def Tcgen05FenceKindAttr : + EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + +def Tcgen05WaitLoad : I32EnumAttrCase<"LOAD", 0, "load">; +def Tcgen05WaitStore : I32EnumAttrCase<"STORE", 1, "store">; +def Tcgen05WaitKind : I32EnumAttr<"Tcgen05WaitKind", "NVVM Tcgen05 wait kind", + [Tcgen05WaitLoad, Tcgen05WaitStore]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::NVVM"; +} +def Tcgen05WaitKindAttr : + EnumAttr { + let assemblyFormat = "`<` $value `>`"; +} + def NVVM_Tcgen05AllocOp : NVVM_Op<"tcgen05.alloc"> { let summary = "Tcgen05 alloc operation"; let description = [{ @@ -2701,6 +2725,91 @@ def NVVM_Tcgen05RelinquishAllocPermitOp : NVVM_Op<"tcgen05.relinquish_alloc_perm }]; } +def NVVM_Tcgen05FenceOp : NVVM_Op<"tcgen05.fence"> { + let summary = "Tcgen05 fence operations"; + let description = [{ + The `tcgen05.fence` orders all prior async tcgen05 operations + with respect to the subsequent tcgen05 and execution ordering operations. 
+ The `tcgen05.fence` orders all subsequent async tcgen05 operations + with respect to the prior tcgen05 and execution ordering operations. + + [For more information refer to the PTX ISA] + (https://docs.nvidia.com/cuda/parallel-thread-execution/#tensorcore-5th-generation-instructions-tcgen05-fence) + }]; + + let arguments = (ins Tcgen05FenceKindAttr:$kind); + let assemblyFormat = "$kind attr-dict"; + + string llvmBuilder = [{ + auto id = ($kind == NVVM::Tcgen05FenceKind::BEFORE_THREAD_SYNC) + ? llvm::Intrinsic::nvvm_tcgen05_fence_before_thread_sync + : llvm::Intrinsic::nvvm_tcgen05_fence_after_thread_sync; + createIntrinsicCall(builder, id); + }]; +} + +def NVVM_Tcgen05WaitOp : NVVM_Op<"tcgen05.wait"> { + let summary = "Tcgen05 wait operations"; + let description = [{ + The `tcgen05.wait` causes the executing thread to block until + all prior `tcgen05.ld` operations issued by the executing thread + have completed. Similarly, the `tcgen05.wait` causes the executing + thread to block until all prior `tcgen05.st` operations issued by the + executing thread have completed. + [For more information refer PTX ISA] + (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-wait) + }]; + + let arguments = (ins Tcgen05WaitKindAttr:$kind); + let assemblyFormat = "$kind attr-dict"; + + string llvmBuilder = [{ + auto id = ($kind == NVVM::Tcgen05WaitKind::LOAD) + ? llvm::Intrinsic::nvvm_tcgen05_wait_ld + : llvm::Intrinsic::nvvm_tcgen05_wait_st; + createIntrinsicCall(builder, id); + }]; +} + +def NVVM_Tcgen05CommitOp : NVVM_Op<"tcgen05.commit"> { + let summary = "Tcgen05 commit operations"; + let description = [{ + The `tcgen05.commit` makes the mbarrier object, specified by + the operand `addr`, track the completion of all the prior + async-tcgen05 operations initiated by the executing thread. + The multicast variants allow signaling on the mbarrier objects + of multiple CTAs within the cluster. 
Operand `multicastMask`, + when present, specifies the destination CTAs in the cluster such + that each bit position in the 16-bit `multicastMask` operand + corresponds to the `nvvm.read.ptx.sreg.ctaid` of the destination CTA. + [For more information refer PTX ISA] + (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen-async-sync-operations-commit) + }]; + + let arguments = (ins + AnyTypeOf<[LLVM_AnyPointer, LLVM_PointerShared]>:$addr, + Optional:$multicastMask, + DefaultValuedAttr:$group); + + let assemblyFormat = [{ + $addr (`,` `multicast_mask` `=` $multicastMask^)? + attr-dict `:` type(operands) + }]; + + let extraClassDeclaration = [{ + static llvm::Intrinsic::ID + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::SmallVector &args); + }]; + + string llvmBuilder = [{ + llvm::SmallVector args; + auto id = NVVM::Tcgen05CommitOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, args); + createIntrinsicCall(builder, id, args); + }]; +} + //===----------------------------------------------------------------------===// // NVVM target attribute. //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 241b25c6caf12..62f0c21338111 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -1284,6 +1284,36 @@ llvm::Intrinsic::ID Tcgen05DeallocOp::getIntrinsicIDAndArgs( return id; } +#define TCGEN05_COMMIT_IMPL(cg, is_shared, mc) \ + is_shared ? llvm::Intrinsic::nvvm_tcgen05_commit##mc##_shared##_##cg \ + : llvm::Intrinsic::nvvm_tcgen05_commit##mc##_##cg + +#define GET_TCGEN05_COMMIT_ID(cta_group, is_shared, has_mc) \ + has_mc ? 
TCGEN05_COMMIT_IMPL(cta_group, is_shared, _mc) \ + : TCGEN05_COMMIT_IMPL(cta_group, is_shared, ) + +llvm::Intrinsic::ID +Tcgen05CommitOp::getIntrinsicIDAndArgs(Operation &op, + LLVM::ModuleTranslation &mt, + llvm::SmallVector &args) { + auto curOp = cast(op); + unsigned AS = llvm::cast(curOp.getAddr().getType()) + .getAddressSpace(); + bool isShared = AS == NVVMMemorySpace::kSharedMemorySpace; + bool hasMulticast = curOp.getMulticastMask() ? true : false; + bool is2CTAMode = curOp.getGroup() == Tcgen05GroupKind::CTA_2; + + auto id = is2CTAMode ? GET_TCGEN05_COMMIT_ID(cg2, isShared, hasMulticast) + : GET_TCGEN05_COMMIT_ID(cg1, isShared, hasMulticast); + + // Fill the Intrinsic Args + args.push_back(mt.lookupValue(curOp.getAddr())); + if (hasMulticast) + args.push_back(mt.lookupValue(curOp.getMulticastMask())); + + return id; +} + /// Infer the result ranges for the NVVM SpecialRangeableRegisterOp that might /// have ConstantRangeAttr. static void nvvmInferResultRanges(Operation *op, Value result, diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-barriers.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-barriers.mlir new file mode 100644 index 0000000000000..7536a4567e34e --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-barriers.mlir @@ -0,0 +1,56 @@ +// RUN: mlir-opt -split-input-file -verify-diagnostics %s +// RUN: mlir-translate -mlir-to-llvmir -split-input-file -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-LLVM + +// CHECK-LABEL: @llvm_nvvm_tcgen05_fence +llvm.func @llvm_nvvm_tcgen05_fence() { + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.fence.before.thread.sync() + nvvm.tcgen05.fence #nvvm.tcgen05_fence + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.fence.after.thread.sync() + nvvm.tcgen05.fence #nvvm.tcgen05_fence + + llvm.return +} + +// CHECK-LABEL: @llvm_nvvm_tcgen05_wait +llvm.func @llvm_nvvm_tcgen05_wait() { + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.wait.ld() + nvvm.tcgen05.wait #nvvm.tcgen05_wait + + // CHECK-LLVM: call void 
@llvm.nvvm.tcgen05.wait.st() + nvvm.tcgen05.wait #nvvm.tcgen05_wait + + llvm.return +} + +// CHECK-LABEL: @llvm_nvvm_tcgen05_commit_generic +llvm.func @llvm_nvvm_tcgen05_commit_generic(%barrier : !llvm.ptr, %cta_mask : i16) { + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.cg1(ptr %{{.*}}) + nvvm.tcgen05.commit %barrier : !llvm.ptr + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.cg2(ptr %{{.*}}) + nvvm.tcgen05.commit %barrier {group = #nvvm.tcgen05_group} : !llvm.ptr + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.mc.cg1(ptr %{{.*}}, i16 %{{.*}}) + nvvm.tcgen05.commit %barrier, multicast_mask = %cta_mask : !llvm.ptr, i16 + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.mc.cg2(ptr %{{.*}}, i16 %{{.*}}) + nvvm.tcgen05.commit %barrier, multicast_mask = %cta_mask {group = #nvvm.tcgen05_group} : !llvm.ptr, i16 + llvm.return +} + +// CHECK-LABEL: @llvm_nvvm_tcgen05_commit_shared +llvm.func @llvm_nvvm_tcgen05_commit_shared(%barrier : !llvm.ptr<3>, %cta_mask : i16) { + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.shared.cg1(ptr addrspace(3) %{{.*}}) + nvvm.tcgen05.commit %barrier : !llvm.ptr<3> + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.shared.cg2(ptr addrspace(3) %{{.*}}) + nvvm.tcgen05.commit %barrier {group = #nvvm.tcgen05_group} : !llvm.ptr<3> + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.mc.shared.cg1(ptr addrspace(3) %{{.*}}, i16 %{{.*}}) + nvvm.tcgen05.commit %barrier, multicast_mask = %cta_mask : !llvm.ptr<3>, i16 + + // CHECK-LLVM: call void @llvm.nvvm.tcgen05.commit.mc.shared.cg2(ptr addrspace(3) %{{.*}}, i16 %{{.*}}) + nvvm.tcgen05.commit %barrier, multicast_mask = %cta_mask {group = #nvvm.tcgen05_group} : !llvm.ptr<3>, i16 + llvm.return +} From 69b8cf4f0621b359b487ad41887c20984be53a34 Mon Sep 17 00:00:00 2001 From: vporpo Date: Sat, 8 Feb 2025 08:34:18 -0800 Subject: [PATCH 063/293] [SandboxVec][BottomUpVec] Add cost estimation and tr-accept-or-revert pass (#126325) The TransactionAcceptOrRevert pass is the 
final pass in the Sandbox Vectorizer's default pass pipeline. Its job is to check the cost before/after vectorization and accept or revert the IR to its original state. Since we are now starting the transaction in BottomUpVec, tests that run a custom pipeline need to accept the transaction. This is done with the help of the TransactionAlwaysAccept pass (tr-accept). --- llvm/include/llvm/SandboxIR/Tracker.h | 7 +- .../Passes/TransactionAcceptOrRevert.h | 30 ++++++ .../Passes/TransactionAlwaysAccept.h | 34 +++++++ llvm/lib/SandboxIR/Tracker.cpp | 3 +- llvm/lib/Transforms/Vectorize/CMakeLists.txt | 1 + .../SandboxVectorizer/DependencyGraph.cpp | 9 ++ .../SandboxVectorizer/Passes/BottomUpVec.cpp | 20 +++- .../SandboxVectorizer/Passes/PassRegistry.def | 2 + .../Passes/TransactionAcceptOrRevert.cpp | 37 ++++++++ .../SandboxVectorizer/SandboxVectorizer.cpp | 5 +- .../SandboxVectorizerPassBuilder.cpp | 2 + .../SandboxVectorizer/X86/simple_cost_test.ll | 91 +++++++++++++++++++ .../SandboxVectorizer/bottomup_basic.ll | 2 +- .../SandboxVectorizer/bottomup_seed_slice.ll | 2 +- .../bottomup_seed_slice_pow2.ll | 4 +- .../Transforms/SandboxVectorizer/cross_bbs.ll | 2 +- .../default_pass_pipeline.ll | 2 + .../test/Transforms/SandboxVectorizer/pack.ll | 2 +- .../SandboxVectorizer/repeated_instrs.ll | 2 +- .../Transforms/SandboxVectorizer/scheduler.ll | 2 +- .../SandboxVectorizer/special_opcodes.ll | 2 +- llvm/unittests/SandboxIR/TrackerTest.cpp | 6 ++ 22 files changed, 249 insertions(+), 18 deletions(-) create mode 100644 llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/TransactionAcceptOrRevert.h create mode 100644 llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/TransactionAlwaysAccept.h create mode 100644 llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/TransactionAcceptOrRevert.cpp create mode 100644 llvm/test/Transforms/SandboxVectorizer/X86/simple_cost_test.ll diff --git a/llvm/include/llvm/SandboxIR/Tracker.h 
b/llvm/include/llvm/SandboxIR/Tracker.h index 9a031f3270837..f7b469965eae8 100644 --- a/llvm/include/llvm/SandboxIR/Tracker.h +++ b/llvm/include/llvm/SandboxIR/Tracker.h @@ -440,8 +440,9 @@ class ShuffleVectorSetMask final : public IRChangeBase { class Tracker { public: enum class TrackerState { - Disabled, ///> Tracking is disabled - Record, ///> Tracking changes + Disabled, ///> Tracking is disabled + Record, ///> Tracking changes + Reverting, ///> Reverting changes }; private: @@ -473,6 +474,8 @@ class Tracker { ~Tracker(); Context &getContext() const { return Ctx; } + /// \Returns true if there are no changes tracked. + bool empty() const { return Changes.empty(); } /// Record \p Change and take ownership. This is the main function used to /// track Sandbox IR changes. void track(std::unique_ptr &&Change) { diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/TransactionAcceptOrRevert.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/TransactionAcceptOrRevert.h new file mode 100644 index 0000000000000..fce9cc0c1bde7 --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/TransactionAcceptOrRevert.h @@ -0,0 +1,30 @@ +//===- TransactionAcceptOrRevert.h ------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is a region pass that checks the region cost before/after vectorization +// and accepts the state of Sandbox IR if the cost is better, or otherwise +// reverts it. 
+// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_TRANSACTIONACCEPTORREVERT_H +#define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_TRANSACTIONACCEPTORREVERT_H + +#include "llvm/SandboxIR/Pass.h" +#include "llvm/SandboxIR/Region.h" + +namespace llvm::sandboxir { + +class TransactionAcceptOrRevert : public RegionPass { +public: + TransactionAcceptOrRevert() : RegionPass("tr-accept-or-revert") {} + bool runOnRegion(Region &Rgn, const Analyses &A) final; +}; + +} // namespace llvm::sandboxir + +#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_TRANSACTIONACCEPTORREVERT_H diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/TransactionAlwaysAccept.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/TransactionAlwaysAccept.h new file mode 100644 index 0000000000000..ed6cf1bf7cf51 --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/TransactionAlwaysAccept.h @@ -0,0 +1,34 @@ +//===- TransactionAlwaysAccept.h --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is a region pass that always accepts the transaction without checking +// its cost. This is mainly used as a final pass in lit tests. 
+// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_TRANSACTIONALWAYSACCEPT_H +#define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_TRANSACTIONALWAYSACCEPT_H + +#include "llvm/SandboxIR/Pass.h" +#include "llvm/SandboxIR/Region.h" + +namespace llvm::sandboxir { + +class TransactionAlwaysAccept : public RegionPass { +public: + TransactionAlwaysAccept() : RegionPass("tr-accept") {} + bool runOnRegion(Region &Rgn, const Analyses &A) final { + auto &Tracker = Rgn.getContext().getTracker(); + bool HasChanges = !Tracker.empty(); + Tracker.accept(); + return HasChanges; + } +}; + +} // namespace llvm::sandboxir + +#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_TRANSACTIONALWAYSACCEPT_H diff --git a/llvm/lib/SandboxIR/Tracker.cpp b/llvm/lib/SandboxIR/Tracker.cpp index 5fa9f181055ca..4fa9e11ae0d4e 100644 --- a/llvm/lib/SandboxIR/Tracker.cpp +++ b/llvm/lib/SandboxIR/Tracker.cpp @@ -347,13 +347,14 @@ void Tracker::save() { void Tracker::revert() { assert(State == TrackerState::Record && "Forgot to save()!"); - State = TrackerState::Disabled; + State = TrackerState::Reverting; for (auto &Change : reverse(Changes)) Change->revert(*this); Changes.clear(); #if !defined(NDEBUG) && defined(EXPENSIVE_CHECKS) SnapshotChecker.expectNoDiff(); #endif + State = TrackerState::Disabled; } void Tracker::accept() { diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index e5fabd318b82c..872e055294d55 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -9,6 +9,7 @@ add_llvm_component_library(LLVMVectorize SandboxVectorizer/Legality.cpp SandboxVectorizer/Passes/BottomUpVec.cpp SandboxVectorizer/Passes/RegionsFromMetadata.cpp + SandboxVectorizer/Passes/TransactionAcceptOrRevert.cpp SandboxVectorizer/SandboxVectorizer.cpp SandboxVectorizer/SandboxVectorizerPassBuilder.cpp SandboxVectorizer/Scheduler.cpp diff --git 
a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index 2680667afc4de..06a5e3bed7f03 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -368,6 +368,9 @@ MemDGNode *DependencyGraph::getMemDGNodeAfter(DGNode *N, bool IncludingN, } void DependencyGraph::notifyCreateInstr(Instruction *I) { + if (Ctx->getTracker().getState() == Tracker::TrackerState::Reverting) + // We don't maintain the DAG while reverting. + return; // Nothing to do if the node is not in the focus range of the DAG. if (!(DAGInterval.contains(I) || DAGInterval.touches(I))) return; @@ -405,6 +408,9 @@ void DependencyGraph::notifyCreateInstr(Instruction *I) { } void DependencyGraph::notifyMoveInstr(Instruction *I, const BBIterator &To) { + if (Ctx->getTracker().getState() == Tracker::TrackerState::Reverting) + // We don't maintain the DAG while reverting. + return; // NOTE: This function runs before `I` moves to its new destination. BasicBlock *BB = To.getNodeParent(); assert(!(To != BB->end() && &*To == I->getNextNode()) && @@ -472,6 +478,9 @@ void DependencyGraph::notifyMoveInstr(Instruction *I, const BBIterator &To) { } void DependencyGraph::notifyEraseInstr(Instruction *I) { + if (Ctx->getTracker().getState() == Tracker::TrackerState::Reverting) + // We don't maintain the DAG while reverting. + return; // Update the MemDGNode chain if this is a memory node. 
if (auto *MemN = dyn_cast_or_null(getNodeOrNull(I))) { auto *PrevMemN = getMemDGNodeBefore(MemN, /*IncludingN=*/false); diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index 6f65657d29790..507d163240127 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -12,6 +12,7 @@ #include "llvm/SandboxIR/Function.h" #include "llvm/SandboxIR/Instruction.h" #include "llvm/SandboxIR/Module.h" +#include "llvm/SandboxIR/Region.h" #include "llvm/SandboxIR/Utils.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h" @@ -448,13 +449,24 @@ bool BottomUpVec::runOnFunction(Function &F, const Analyses &A) { assert(SeedSlice.size() >= 2 && "Should have been rejected!"); - // TODO: If vectorization succeeds, run the RegionPassManager on the - // resulting region. - // TODO: Refactor to remove the unnecessary copy to SeedSliceVals. SmallVector SeedSliceVals(SeedSlice.begin(), SeedSlice.end()); - Change |= tryVectorize(SeedSliceVals); + // Create an empty region. Instructions get added to the region + // automatically by the callbacks. + auto &Ctx = F.getContext(); + Region Rgn(Ctx, A.getTTI()); + // Save the state of the IR before we make any changes. The + // transaction gets accepted/reverted by the tr-accept-or-revert pass. + Ctx.save(); + // Try to vectorize starting from the seed slice. The returned value + // is true if we found vectorizable code and generated some vector + // code for it. It does not mean that the code is profitable. + bool VecSuccess = tryVectorize(SeedSliceVals); + if (VecSuccess) + // WARNING: All passes should return false, except those that + // accept/revert the state. 
+ Change |= RPM.runOnRegion(Rgn, A); } } } diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def index 0dc72842f1abe..f3aa12729860f 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def @@ -19,6 +19,8 @@ REGION_PASS("null", ::llvm::sandboxir::NullPass) REGION_PASS("print-instruction-count", ::llvm::sandboxir::PrintInstructionCount) +REGION_PASS("tr-accept", ::llvm::sandboxir::TransactionAlwaysAccept) +REGION_PASS("tr-accept-or-revert", ::llvm::sandboxir::TransactionAcceptOrRevert) #undef REGION_PASS diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/TransactionAcceptOrRevert.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/TransactionAcceptOrRevert.cpp new file mode 100644 index 0000000000000..874390ba2daae --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/TransactionAcceptOrRevert.cpp @@ -0,0 +1,37 @@ +//===- TransactionAcceptOrRevert.cpp - Check cost and accept/revert region ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/TransactionAcceptOrRevert.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/InstructionCost.h" + +namespace llvm { + +static cl::opt CostThreshold("sbvec-cost-threshold", cl::init(0), + cl::Hidden, + cl::desc("Vectorization cost threshold.")); + +namespace sandboxir { + +bool TransactionAcceptOrRevert::runOnRegion(Region &Rgn, const Analyses &A) { + const auto &SB = Rgn.getScoreboard(); + InstructionCost CostAfterMinusBefore = SB.getAfterCost() - SB.getBeforeCost(); + // TODO: Print costs / write to remarks. + auto &Tracker = Rgn.getContext().getTracker(); + if (CostAfterMinusBefore < -CostThreshold) { + bool HasChanges = !Tracker.empty(); + Tracker.accept(); + return HasChanges; + } + // Revert the IR. + Rgn.getContext().getTracker().revert(); + return false; +} + +} // namespace sandboxir +} // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index 798a0ad915375..b233d35212f94 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -31,9 +31,10 @@ static cl::opt UserDefinedPassPipeline( SandboxVectorizerPass::SandboxVectorizerPass() : FPM("fpm") { if (UserDefinedPassPipeline == DefaultPipelineMagicStr) { - // TODO: Add region passes to the default pipeline. + // TODO: Add passes to the default pipeline. It currently contains: + // - the bottom-up-vectorizer pass FPM.setPassPipeline( - "bottom-up-vec<>", + "bottom-up-vec", sandboxir::SandboxVectorizerPassBuilder::createFunctionPass); } else { // Create the user-defined pipeline. 
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.cpp index 5ecf7b2ed0d25..0c1ab55e91a5c 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.cpp @@ -4,6 +4,8 @@ #include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/NullPass.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/PrintInstructionCount.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/TransactionAcceptOrRevert.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/TransactionAlwaysAccept.h" namespace llvm::sandboxir { diff --git a/llvm/test/Transforms/SandboxVectorizer/X86/simple_cost_test.ll b/llvm/test/Transforms/SandboxVectorizer/X86/simple_cost_test.ll new file mode 100644 index 0000000000000..f1df52bd88ad7 --- /dev/null +++ b/llvm/test/Transforms/SandboxVectorizer/X86/simple_cost_test.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=sandbox-vectorizer -mtriple=x86_64-- -mattr=+sse4.1 %s -S -sbvec-cost-threshold=0 | FileCheck %s --check-prefix=THRESHOLD_0 +; RUN: opt -passes=sandbox-vectorizer -mtriple=x86_64-- -mattr=+sse4.1 %s -S -sbvec-cost-threshold=99 | FileCheck %s --check-prefix=THRESHOLD_99 + +define void @simple_cost_test(ptr %ptr) { +; THRESHOLD_0-LABEL: define void @simple_cost_test( +; THRESHOLD_0-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; THRESHOLD_0-NEXT: [[PTR0:%.*]] = getelementptr double, ptr [[PTR]], i32 0 +; THRESHOLD_0-NEXT: [[VECL:%.*]] = load <2 x double>, ptr [[PTR0]], align 8, !sandboxvec [[META0:![0-9]+]] +; THRESHOLD_0-NEXT: store <2 x double> [[VECL]], ptr [[PTR0]], align 8, !sandboxvec [[META0]] +; 
THRESHOLD_0-NEXT: ret void +; +; THRESHOLD_99-LABEL: define void @simple_cost_test( +; THRESHOLD_99-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; THRESHOLD_99-NEXT: [[PTR0:%.*]] = getelementptr double, ptr [[PTR]], i32 0 +; THRESHOLD_99-NEXT: [[PTR1:%.*]] = getelementptr double, ptr [[PTR]], i32 1, !sandboxvec [[META0:![0-9]+]] +; THRESHOLD_99-NEXT: [[LD0:%.*]] = load double, ptr [[PTR0]], align 8, !sandboxvec [[META0]] +; THRESHOLD_99-NEXT: [[LD1:%.*]] = load double, ptr [[PTR1]], align 8, !sandboxvec [[META0]] +; THRESHOLD_99-NEXT: store double [[LD0]], ptr [[PTR0]], align 8, !sandboxvec [[META0]] +; THRESHOLD_99-NEXT: store double [[LD1]], ptr [[PTR1]], align 8, !sandboxvec [[META0]] +; THRESHOLD_99-NEXT: ret void +; + %ptr0 = getelementptr double, ptr %ptr, i32 0 + %ptr1 = getelementptr double, ptr %ptr, i32 1 + %ld0 = load double, ptr %ptr0 + %ld1 = load double, ptr %ptr1 + store double %ld0, ptr %ptr0 + store double %ld1, ptr %ptr1 + ret void +} + +define void @pack_cost_test_(ptr %ptr) { +; THRESHOLD_0-LABEL: define void @pack_cost_test_( +; THRESHOLD_0-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; THRESHOLD_0-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0 +; THRESHOLD_0-NEXT: [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1 +; THRESHOLD_0-NEXT: [[LD0:%.*]] = load float, ptr [[PTR0]], align 4 +; THRESHOLD_0-NEXT: [[LD1:%.*]] = load float, ptr [[PTR1]], align 4 +; THRESHOLD_0-NEXT: [[PACK4:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0, !sandboxvec [[META1:![0-9]+]] +; THRESHOLD_0-NEXT: [[PACK5:%.*]] = insertelement <4 x float> [[PACK4]], float [[LD1]], i32 1, !sandboxvec [[META1]] +; THRESHOLD_0-NEXT: [[PACK6:%.*]] = insertelement <4 x float> [[PACK5]], float [[LD0]], i32 2, !sandboxvec [[META1]] +; THRESHOLD_0-NEXT: [[PACK7:%.*]] = insertelement <4 x float> [[PACK6]], float [[LD1]], i32 3, !sandboxvec [[META1]] +; THRESHOLD_0-NEXT: [[PACK:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0, !sandboxvec [[META1]] +; 
THRESHOLD_0-NEXT: [[PACK1:%.*]] = insertelement <4 x float> [[PACK]], float [[LD1]], i32 1, !sandboxvec [[META1]] +; THRESHOLD_0-NEXT: [[PACK2:%.*]] = insertelement <4 x float> [[PACK1]], float [[LD0]], i32 2, !sandboxvec [[META1]] +; THRESHOLD_0-NEXT: [[PACK3:%.*]] = insertelement <4 x float> [[PACK2]], float [[LD1]], i32 3, !sandboxvec [[META1]] +; THRESHOLD_0-NEXT: [[VEC:%.*]] = fmul <4 x float> [[PACK3]], [[PACK7]], !sandboxvec [[META1]] +; THRESHOLD_0-NEXT: store <4 x float> [[VEC]], ptr [[PTR0]], align 4, !sandboxvec [[META1]] +; THRESHOLD_0-NEXT: ret void +; +; THRESHOLD_99-LABEL: define void @pack_cost_test_( +; THRESHOLD_99-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; THRESHOLD_99-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0 +; THRESHOLD_99-NEXT: [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1 +; THRESHOLD_99-NEXT: [[PTR2:%.*]] = getelementptr float, ptr [[PTR]], i32 2, !sandboxvec [[META1:![0-9]+]] +; THRESHOLD_99-NEXT: [[PTR3:%.*]] = getelementptr float, ptr [[PTR]], i32 3, !sandboxvec [[META1]] +; THRESHOLD_99-NEXT: [[LD0:%.*]] = load float, ptr [[PTR0]], align 4 +; THRESHOLD_99-NEXT: [[LD1:%.*]] = load float, ptr [[PTR1]], align 4 +; THRESHOLD_99-NEXT: [[MUL0:%.*]] = fmul float [[LD0]], [[LD0]], !sandboxvec [[META1]] +; THRESHOLD_99-NEXT: [[MUL1:%.*]] = fmul float [[LD1]], [[LD1]], !sandboxvec [[META1]] +; THRESHOLD_99-NEXT: [[MUL2:%.*]] = fmul float [[LD0]], [[LD0]], !sandboxvec [[META1]] +; THRESHOLD_99-NEXT: [[MUL3:%.*]] = fmul float [[LD1]], [[LD1]], !sandboxvec [[META1]] +; THRESHOLD_99-NEXT: store float [[MUL0]], ptr [[PTR0]], align 4, !sandboxvec [[META1]] +; THRESHOLD_99-NEXT: store float [[MUL1]], ptr [[PTR1]], align 4, !sandboxvec [[META1]] +; THRESHOLD_99-NEXT: store float [[MUL2]], ptr [[PTR2]], align 4, !sandboxvec [[META1]] +; THRESHOLD_99-NEXT: store float [[MUL3]], ptr [[PTR3]], align 4, !sandboxvec [[META1]] +; THRESHOLD_99-NEXT: ret void +; + %ptr0 = getelementptr float, ptr %ptr, i32 0 + %ptr1 = getelementptr float, 
ptr %ptr, i32 1 + %ptr2 = getelementptr float, ptr %ptr, i32 2 + %ptr3 = getelementptr float, ptr %ptr, i32 3 + %ld0 = load float, ptr %ptr0 + %ld1 = load float, ptr %ptr1 + %mul0 = fmul float %ld0, %ld0 + %mul1 = fmul float %ld1, %ld1 + %mul2 = fmul float %ld0, %ld0 + %mul3 = fmul float %ld1, %ld1 + store float %mul0, ptr %ptr0 + store float %mul1, ptr %ptr1 + store float %mul2, ptr %ptr2 + store float %mul3, ptr %ptr3 + ret void +} +;. +; THRESHOLD_0: [[META0]] = distinct !{!"sandboxregion"} +; THRESHOLD_0: [[META1]] = distinct !{!"sandboxregion"} +;. +; THRESHOLD_99: [[META0]] = distinct !{!"sandboxregion"} +; THRESHOLD_99: [[META1]] = distinct !{!"sandboxregion"} +;. diff --git a/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll b/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll index ee5a3a514b3c5..ee8592c04b62c 100644 --- a/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll +++ b/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec<>" %s -S | FileCheck %s +; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec" %s -S | FileCheck %s define void @store_load(ptr %ptr) { ; CHECK-LABEL: define void @store_load( diff --git a/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice.ll b/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice.ll index 8459c3addaa83..202b5a6fbd6c9 100644 --- a/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice.ll +++ b/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec<>" %s -S | 
FileCheck %s +; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec" %s -S | FileCheck %s declare void @foo() diff --git a/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice_pow2.ll b/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice_pow2.ll index e186d5fa86e4a..f1c6e3297d79c 100644 --- a/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice_pow2.ll +++ b/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice_pow2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2=false -sbvec-passes="bottom-up-vec<>" %s -S | FileCheck %s --check-prefix=POW2 -; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2=true -sbvec-passes="bottom-up-vec<>" %s -S | FileCheck %s --check-prefix=NON-POW2 +; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2=false -sbvec-passes="bottom-up-vec" %s -S | FileCheck %s --check-prefix=POW2 +; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2=true -sbvec-passes="bottom-up-vec" %s -S | FileCheck %s --check-prefix=NON-POW2 define void @pow2(ptr %ptr, float %val) { ; POW2-LABEL: define void @pow2( diff --git a/llvm/test/Transforms/SandboxVectorizer/cross_bbs.ll b/llvm/test/Transforms/SandboxVectorizer/cross_bbs.ll index 6ec31060d7e0f..ff1604173c317 100644 --- a/llvm/test/Transforms/SandboxVectorizer/cross_bbs.ll +++ b/llvm/test/Transforms/SandboxVectorizer/cross_bbs.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec<>" %s -S | FileCheck %s +; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec" 
%s -S | FileCheck %s define void @cross_bbs(ptr %ptr) { ; CHECK-LABEL: define void @cross_bbs( diff --git a/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll b/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll index 1d7be43336c87..10de4338caf23 100644 --- a/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll +++ b/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll @@ -4,8 +4,10 @@ ; This checks the default pass pipeline for the sandbox vectorizer. define void @pipeline() { +; CHECK: fpm ; CHECK: bottom-up-vec ; CHECK: rpm +; CHECK: tr-accept-or-revert ; CHECK-EMPTY: ret void } diff --git a/llvm/test/Transforms/SandboxVectorizer/pack.ll b/llvm/test/Transforms/SandboxVectorizer/pack.ll index ec6e61a90c0fb..da41036e3a58b 100644 --- a/llvm/test/Transforms/SandboxVectorizer/pack.ll +++ b/llvm/test/Transforms/SandboxVectorizer/pack.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec<>" %s -S | FileCheck %s +; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec" %s -S | FileCheck %s define void @pack_constants(ptr %ptr) { ; CHECK-LABEL: define void @pack_constants( diff --git a/llvm/test/Transforms/SandboxVectorizer/repeated_instrs.ll b/llvm/test/Transforms/SandboxVectorizer/repeated_instrs.ll index 6026e92ef9a82..25d9d79154d35 100644 --- a/llvm/test/Transforms/SandboxVectorizer/repeated_instrs.ll +++ b/llvm/test/Transforms/SandboxVectorizer/repeated_instrs.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec<>" %s -S | FileCheck %s +; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 
-sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec" %s -S | FileCheck %s define i32 @repeated_splat(ptr %ptr, i32 %v) #0 { ; CHECK-LABEL: define i32 @repeated_splat( diff --git a/llvm/test/Transforms/SandboxVectorizer/scheduler.ll b/llvm/test/Transforms/SandboxVectorizer/scheduler.ll index 847c978aa4912..92a78a979192b 100644 --- a/llvm/test/Transforms/SandboxVectorizer/scheduler.ll +++ b/llvm/test/Transforms/SandboxVectorizer/scheduler.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec<>" %s -S | FileCheck %s +; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec" %s -S | FileCheck %s ; This used to crash because the newly added pack instructions would not update ; the DAG and scheduler, leading to def-after-use. diff --git a/llvm/test/Transforms/SandboxVectorizer/special_opcodes.ll b/llvm/test/Transforms/SandboxVectorizer/special_opcodes.ll index fe3a2067d481d..e8fe8b4fa88e3 100644 --- a/llvm/test/Transforms/SandboxVectorizer/special_opcodes.ll +++ b/llvm/test/Transforms/SandboxVectorizer/special_opcodes.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec<>" %s -S | FileCheck %s +; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec" %s -S | FileCheck %s ; This file includes tests for opcodes that need special checks. 
diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp index 4eedab124bfa0..9c18247b6b96d 100644 --- a/llvm/unittests/SandboxIR/TrackerTest.cpp +++ b/llvm/unittests/SandboxIR/TrackerTest.cpp @@ -52,6 +52,9 @@ define void @foo(ptr %ptr) { auto *F = Ctx.createFunction(&LLVMF); auto *BB = &*F->begin(); auto &Tracker = Ctx.getTracker(); + // Check empty(). + EXPECT_TRUE(Ctx.getTracker().empty()); + Tracker.save(); auto It = BB->begin(); auto *Gep0 = &*It++; @@ -65,6 +68,9 @@ define void @foo(ptr %ptr) { EXPECT_EQ(St->getOperand(1), Gep1); EXPECT_EQ(Ld->getOperand(0), Gep1); + // Check empty(). + EXPECT_FALSE(Ctx.getTracker().empty()); + Ctx.getTracker().revert(); EXPECT_NE(St->getOperand(0), Ld); EXPECT_EQ(St->getOperand(1), Gep0); From 40ce8fd8436d7b52b31cb8174fe442c9c1cae7a0 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Sat, 8 Feb 2025 16:35:01 +0000 Subject: [PATCH 064/293] [gn build] Port 69b8cf4f0621 --- llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn index f59b6446f0dea..433a7f43bb780 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn @@ -19,6 +19,7 @@ static_library("Vectorize") { "SandboxVectorizer/Legality.cpp", "SandboxVectorizer/Passes/BottomUpVec.cpp", "SandboxVectorizer/Passes/RegionsFromMetadata.cpp", + "SandboxVectorizer/Passes/TransactionAcceptOrRevert.cpp", "SandboxVectorizer/SandboxVectorizer.cpp", "SandboxVectorizer/SandboxVectorizerPassBuilder.cpp", "SandboxVectorizer/Scheduler.cpp", From 5c8c2b3db54395073e3183f89167156df29dff61 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Sat, 8 Feb 2025 18:02:54 +0100 Subject: [PATCH 065/293] [Flang] Rename libFortranRuntime.a to libflang_rt.runtime.a (#122341) Following the 
conclusion of the [RFC](https://discourse.llvm.org/t/rfc-names-for-flang-rt-libraries/84321), rename Flang's runtime libraries as follows: * libFortranRuntime.(a|so) to libflang_rt.runtime.(a|so) * libFortranFloat128Math.a to libflang_rt.quadmath.a * libCufRuntime_cuda_${CUDAToolkit_VERSION_MAJOR}.(a|so) to libflang_rt.cuda_${CUDAToolkit_VERSION_MAJOR}.(a|so) This follows the same naming scheme as Compiler-RT libraries (`libclang_rt.${component}.(a|so)`). It provides some consistency between Flang's runtime libraries for current and potential future library components. --- clang/lib/Driver/ToolChains/CommonArgs.cpp | 6 +- clang/lib/Driver/ToolChains/Flang.cpp | 22 +++++--- flang/CMakeLists.txt | 2 +- flang/cmake/modules/AddFlang.cmake | 2 +- flang/docs/FlangDriver.md | 6 +- flang/docs/GettingStarted.md | 8 +-- flang/docs/OpenACC-descriptor-management.md | 2 +- flang/docs/Real16MathSupport.md | 4 +- flang/docs/ReleaseNotes.md | 7 +++ .../ExternalHelloWorld/CMakeLists.txt | 2 +- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 4 +- flang/runtime/CMakeLists.txt | 55 ++++++++++++------- flang/runtime/CUDA/CMakeLists.txt | 6 +- flang/runtime/Float128Math/CMakeLists.txt | 14 ++--- flang/runtime/time-intrinsic.cpp | 2 +- flang/runtime/tools.h | 2 +- flang/test/CMakeLists.txt | 8 ++- .../test/Driver/gcc-toolchain-install-dir.f90 | 2 +- flang/test/Driver/linker-flags.f90 | 34 ++++++------ .../test/Driver/msvc-dependent-lib-flags.f90 | 8 +-- flang/test/Driver/nostdlib.f90 | 2 +- flang/test/Runtime/no-cpp-dep.c | 2 +- flang/test/lit.cfg.py | 2 +- flang/tools/f18/CMakeLists.txt | 8 +-- flang/unittests/CMakeLists.txt | 2 +- flang/unittests/Evaluate/CMakeLists.txt | 4 +- flang/unittests/Runtime/CMakeLists.txt | 2 +- flang/unittests/Runtime/CUDA/CMakeLists.txt | 4 +- lld/COFF/MinGW.cpp | 2 +- 29 files changed, 128 insertions(+), 96 deletions(-) diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index e0b5d003ebb13..61917db4d780d 
100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1321,7 +1321,7 @@ void tools::addOpenMPHostOffloadingArgs(const Compilation &C, /// Add Fortran runtime libs void tools::addFortranRuntimeLibs(const ToolChain &TC, const ArgList &Args, llvm::opt::ArgStringList &CmdArgs) { - // Link FortranRuntime + // Link flang_rt.runtime // These are handled earlier on Windows by telling the frontend driver to // add the correct libraries to link against as dependents in the object // file. @@ -1330,14 +1330,14 @@ void tools::addFortranRuntimeLibs(const ToolChain &TC, const ArgList &Args, F128LibName.consume_front_insensitive("lib"); if (!F128LibName.empty()) { bool AsNeeded = !TC.getTriple().isOSAIX(); - CmdArgs.push_back("-lFortranFloat128Math"); + CmdArgs.push_back("-lflang_rt.quadmath"); if (AsNeeded) addAsNeededOption(TC, Args, CmdArgs, /*as_needed=*/true); CmdArgs.push_back(Args.MakeArgString("-l" + F128LibName)); if (AsNeeded) addAsNeededOption(TC, Args, CmdArgs, /*as_needed=*/false); } - CmdArgs.push_back("-lFortranRuntime"); + CmdArgs.push_back("-lflang_rt.runtime"); addArchSpecificRPath(TC, Args, CmdArgs); // needs libexecinfo for backtrace functions diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index e7b68f4a8c60a..591003f56e8bb 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -346,11 +346,15 @@ static void processVSRuntimeLibrary(const ToolChain &TC, const ArgList &Args, ArgStringList &CmdArgs) { assert(TC.getTriple().isKnownWindowsMSVCEnvironment() && "can only add VS runtime library on Windows!"); - // if -fno-fortran-main has been passed, skip linking Fortran_main.a - if (TC.getTriple().isKnownWindowsMSVCEnvironment()) { - CmdArgs.push_back(Args.MakeArgString( - "--dependent-lib=" + TC.getCompilerRTBasename(Args, "builtins"))); - } + + // Flang/Clang (including clang-cl) -compiled programs targeting the MSVC ABI + 
// should only depend on msv(u)crt. LLVM still emits libgcc/compiler-rt + // functions in some cases like 128-bit integer math (__udivti3, __modti3, + // __fixsfti, __floattidf, ...) that msvc does not support. We are injecting a + // dependency to Compiler-RT's builtin library where these are implemented. + CmdArgs.push_back(Args.MakeArgString( + "--dependent-lib=" + TC.getCompilerRTBasename(Args, "builtins"))); + unsigned RTOptionID = options::OPT__SLASH_MT; if (auto *rtl = Args.getLastArg(options::OPT_fms_runtime_lib_EQ)) { RTOptionID = llvm::StringSwitch(rtl->getValue()) @@ -364,26 +368,26 @@ static void processVSRuntimeLibrary(const ToolChain &TC, const ArgList &Args, case options::OPT__SLASH_MT: CmdArgs.push_back("-D_MT"); CmdArgs.push_back("--dependent-lib=libcmt"); - CmdArgs.push_back("--dependent-lib=FortranRuntime.static.lib"); + CmdArgs.push_back("--dependent-lib=flang_rt.runtime.static.lib"); break; case options::OPT__SLASH_MTd: CmdArgs.push_back("-D_MT"); CmdArgs.push_back("-D_DEBUG"); CmdArgs.push_back("--dependent-lib=libcmtd"); - CmdArgs.push_back("--dependent-lib=FortranRuntime.static_dbg.lib"); + CmdArgs.push_back("--dependent-lib=flang_rt.runtime.static_dbg.lib"); break; case options::OPT__SLASH_MD: CmdArgs.push_back("-D_MT"); CmdArgs.push_back("-D_DLL"); CmdArgs.push_back("--dependent-lib=msvcrt"); - CmdArgs.push_back("--dependent-lib=FortranRuntime.dynamic.lib"); + CmdArgs.push_back("--dependent-lib=flang_rt.runtime.dynamic.lib"); break; case options::OPT__SLASH_MDd: CmdArgs.push_back("-D_MT"); CmdArgs.push_back("-D_DEBUG"); CmdArgs.push_back("-D_DLL"); CmdArgs.push_back("--dependent-lib=msvcrtd"); - CmdArgs.push_back("--dependent-lib=FortranRuntime.dynamic_dbg.lib"); + CmdArgs.push_back("--dependent-lib=flang_rt.runtime.dynamic_dbg.lib"); break; } } diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt index e6de8df5cef15..c6271f1856eb9 100644 --- a/flang/CMakeLists.txt +++ b/flang/CMakeLists.txt @@ -301,7 +301,7 @@ set(FLANG_DEFAULT_LINKER 
"" CACHE STRING "Default linker to use (linker name or absolute path, empty for platform default)") set(FLANG_DEFAULT_RTLIB "" CACHE STRING - "Default Fortran runtime library to use (\"libFortranRuntime\"), leave empty for platform default.") + "Default Fortran runtime library to use (\"libflang_rt.runtime\"), leave empty for platform default.") if (NOT(FLANG_DEFAULT_RTLIB STREQUAL "")) message(WARNING "Resetting Flang's default runtime library to use platform default.") diff --git a/flang/cmake/modules/AddFlang.cmake b/flang/cmake/modules/AddFlang.cmake index 1f178772067ed..c9f65eb73fef0 100644 --- a/flang/cmake/modules/AddFlang.cmake +++ b/flang/cmake/modules/AddFlang.cmake @@ -57,7 +57,7 @@ function(add_flang_library name) set(LIBTYPE SHARED) elseif(ARG_STATIC) # If BUILD_SHARED_LIBS and ARG_STATIC are both set, llvm_add_library prioritizes STATIC. - # This is required behavior for libFortranFloat128Math. + # This is required behavior for libflang_rt.quadmath. set(LIBTYPE STATIC) else() # Let llvm_add_library decide, taking BUILD_SHARED_LIBS into account. diff --git a/flang/docs/FlangDriver.md b/flang/docs/FlangDriver.md index 5f960a954783d..97744f0bee069 100644 --- a/flang/docs/FlangDriver.md +++ b/flang/docs/FlangDriver.md @@ -175,18 +175,18 @@ like this: ``` $ flang -v -o example example.o -"/usr/bin/ld" [...] example.o [...] "-lFortranRuntime" [...] +"/usr/bin/ld" [...] example.o [...] "-lflang_rt.runtime" [...] ``` The automatically added libraries are: -* `FortranRuntime`: Provides most of the Flang runtime library. +* `flang_rt.runtime`: Provides most of the Flang runtime library. If the code is C/C++ based and invokes Fortran routines, one can either use Clang or Flang as the linker driver. If Clang is used, it will automatically all required runtime libraries needed by C++ (e.g., for STL) to the linker invocation. In this case, one has to explicitly provide the Fortran runtime library -`FortranRuntime`. An alternative is to use Flang to link. 
+`flang_rt.runtime`. An alternative is to use Flang to link. In this case, it may be required to explicitly supply C++ runtime libraries. On Darwin, the logical root where the system libraries are located (sysroot) diff --git a/flang/docs/GettingStarted.md b/flang/docs/GettingStarted.md index 1c85a6754b155..e422a31a0b402 100644 --- a/flang/docs/GettingStarted.md +++ b/flang/docs/GettingStarted.md @@ -216,7 +216,7 @@ cmake \ -DCMAKE_CUDA_COMPILER=clang \ -DCMAKE_CUDA_HOST_COMPILER=clang++ \ ../runtime/ -make -j FortranRuntime +make -j flang-rt ``` Note that the used version of `clang` must [support](https://releases.llvm.org/16.0.0/tools/clang/docs/ReleaseNotes.html#cuda-support) @@ -239,7 +239,7 @@ cmake \ -DCMAKE_CUDA_HOST_COMPILER=clang++ \ ../runtime/ -make -j FortranRuntime +make -j flang-rt ``` Note that `nvcc` might limit support to certain @@ -294,7 +294,7 @@ cmake \ -DFLANG_OMP_DEVICE_ARCHITECTURES="all" \ ../runtime/ -make -j FortranRuntime +make -j flang-rt ``` The result of the build is a "device-only" library, i.e. the host @@ -309,7 +309,7 @@ The same set of CMake variables works for Flang in-tree build. One may provide optional CMake variables to customize the build. Available options: * `-DFLANG_RUNTIME_F128_MATH_LIB=libquadmath`: enables build of - `FortranFloat128Math` library that provides `REAL(16)` math APIs + `flang_rt.quadmath` library that provides `REAL(16)` math APIs for intrinsics such as `SIN`, `COS`, etc. GCC `libquadmath`'s header file `quadmath.h` must be available to the build compiler. [More details](Real16MathSupport.md). 
diff --git a/flang/docs/OpenACC-descriptor-management.md b/flang/docs/OpenACC-descriptor-management.md index 52d00ae4daef8..008c57937e23b 100644 --- a/flang/docs/OpenACC-descriptor-management.md +++ b/flang/docs/OpenACC-descriptor-management.md @@ -427,7 +427,7 @@ The implementation's behavior may be described as (OpenACC 2.7.2): All the "is-present" checks and the data actions for the auxiliary pointers must be performed atomically with regards to the present counters bookkeeping. -The API relies on the primitives provided by `liboffload`, so it is provided by a new F18 runtime library, e.g. `FortranOffloadRuntime`, that depends on `FortranRuntime` and `liboffload`. The F18 driver adds `FortranOffloadRuntime` for linking under `-fopenacc`/`-fopenmp` (and maybe additional switches like `-fopenmp-targets`). +The API relies on the primitives provided by `liboffload`, so it is provided by a new F18 runtime library, e.g. `FortranOffloadRuntime`, that depends on `flang_rt.runtime` and `liboffload`. The F18 driver adds `FortranOffloadRuntime` for linking under `-fopenacc`/`-fopenmp` (and maybe additional switches like `-fopenmp-targets`). ## TODOs: diff --git a/flang/docs/Real16MathSupport.md b/flang/docs/Real16MathSupport.md index 21482c7be21af..93492c8b767c3 100644 --- a/flang/docs/Real16MathSupport.md +++ b/flang/docs/Real16MathSupport.md @@ -12,9 +12,9 @@ To support most `REAL(16)` (i.e. 128-bit float) math intrinsics Flang relies on third-party libraries providing the implementation. `-DFLANG_RUNTIME_F128_MATH_LIB=libquadmath` CMake option can be used -to build `FortranFloat128Math` library that has unresolved references +to build `libflang_rt.quadmath` library that has unresolved references to GCC `libquadmath` library. A Flang driver built with this option -will automatically link `FortranFloat128Math` and `libquadmath` libraries +will automatically link `libflang_rt.quadmath` and `libquadmath` libraries to any Fortran program. 
This implies that `libquadmath` library has to be available in the standard library paths, so that linker can find it. The `libquadmath` library installation into Flang project diff --git a/flang/docs/ReleaseNotes.md b/flang/docs/ReleaseNotes.md index f0c956281915f..387d4b2e62e0f 100644 --- a/flang/docs/ReleaseNotes.md +++ b/flang/docs/ReleaseNotes.md @@ -36,6 +36,13 @@ page](https://llvm.org/releases/). ## Build System Changes + * The FortranRuntime library has been renamed to `flang_rt.runtime`. + + * The FortranFloat128Math library has been renamed to `flang_rt.quadmath`. + + * The CufRuntime_cuda_${version} library has been renamed to + `flang_rt.cuda_${version}`. + ## New Issues Found diff --git a/flang/examples/ExternalHelloWorld/CMakeLists.txt b/flang/examples/ExternalHelloWorld/CMakeLists.txt index 042d4b6238ba4..b61948718a5e3 100644 --- a/flang/examples/ExternalHelloWorld/CMakeLists.txt +++ b/flang/examples/ExternalHelloWorld/CMakeLists.txt @@ -5,5 +5,5 @@ add_llvm_example(external-hello-world target_link_libraries(external-hello-world PRIVATE - FortranRuntime + flang_rt.runtime ) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 9b684520ec078..9a80e36efe837 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -809,8 +809,8 @@ prettyPrintIntrinsicName(fir::FirOpBuilder &builder, mlir::Location loc, // Generate a call to the Fortran runtime library providing // support for 128-bit float math. // On 'HAS_LDBL128' targets the implementation -// is provided by FortranRuntime, otherwise, it is done via -// FortranFloat128Math library. In the latter case the compiler +// is provided by flang_rt, otherwise, it is done via the +// libflang_rt.quadmath library. In the latter case the compiler // has to be built with FLANG_RUNTIME_F128_MATH_LIB to guarantee // proper linking actions in the driver. 
static mlir::Value genLibF128Call(fir::FirOpBuilder &builder, diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt index 3587ec078c47f..7cc720e2df9af 100644 --- a/flang/runtime/CMakeLists.txt +++ b/flang/runtime/CMakeLists.txt @@ -241,13 +241,13 @@ set(supported_files utf.cpp ) -enable_cuda_compilation(FortranRuntime "${supported_files}") +enable_cuda_compilation(flang_rt "${supported_files}") enable_omp_offload_compilation("${supported_files}") -if (NOT TARGET FortranFloat128Math) - # If FortranFloat128Math is not defined, then we are not building - # standalone FortranFloat128Math library. Instead, include - # the relevant sources into FortranRuntime itself. +if (NOT TARGET flang_rt.quadmath) + # If flang_rt.quadmath is not defined, then we are not building + # standalone flang_rt.quadmath library. Instead, include + # the relevant sources into flang_rt.runtime itself. # The information is provided via FortranFloat128MathILib # interface library. get_target_property(f128_sources @@ -275,7 +275,7 @@ if (NOT TARGET FortranFloat128Math) endif() if (NOT DEFINED MSVC) - add_flang_library(FortranRuntime + add_flang_library(flang_rt.runtime ${sources} LINK_LIBS ${linked_libraries} @@ -283,33 +283,36 @@ if (NOT DEFINED MSVC) INSTALL_WITH_TOOLCHAIN ) else() - add_flang_library(FortranRuntime + add_flang_library(flang_rt.runtime ${sources} LINK_LIBS ${linked_libraries} ) set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded) - add_flang_library(FortranRuntime.static ${sources} + add_flang_library(flang_rt.runtime.static ${sources} INSTALL_WITH_TOOLCHAIN) - set_target_properties(FortranRuntime.static PROPERTIES FOLDER "Flang/Runtime Libraries") + set_target_properties(flang_rt.runtime.static PROPERTIES FOLDER "Flang/Runtime Libraries") set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDLL) - add_flang_library(FortranRuntime.dynamic ${sources} + add_flang_library(flang_rt.runtime.dynamic ${sources} INSTALL_WITH_TOOLCHAIN) - set_target_properties(FortranRuntime.dynamic 
PROPERTIES FOLDER "Flang/Runtime Libraries") + set_target_properties(flang_rt.runtime.dynamic PROPERTIES FOLDER "Flang/Runtime Libraries") set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDebug) - add_flang_library(FortranRuntime.static_dbg ${sources} + add_flang_library(flang_rt.runtime.static_dbg ${sources} INSTALL_WITH_TOOLCHAIN) - set_target_properties(FortranRuntime.static_dbg PROPERTIES FOLDER "Flang/Runtime Libraries") + set_target_properties(flang_rt.runtime.static_dbg PROPERTIES FOLDER "Flang/Runtime Libraries") set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDebugDLL) - add_flang_library(FortranRuntime.dynamic_dbg ${sources} + add_flang_library(flang_rt.runtime.dynamic_dbg ${sources} INSTALL_WITH_TOOLCHAIN) - set_target_properties(FortranRuntime.dynamic_dbg PROPERTIES FOLDER "Flang/Runtime Libraries") - add_dependencies(FortranRuntime FortranRuntime.static FortranRuntime.dynamic - FortranRuntime.static_dbg FortranRuntime.dynamic_dbg) + set_target_properties(flang_rt.runtime.dynamic_dbg PROPERTIES FOLDER "Flang/Runtime Libraries") + add_dependencies(flang_rt.runtime + flang_rt.runtime.static + flang_rt.runtime.dynamic + flang_rt.runtime.static_dbg + flang_rt.runtime.dynamic_dbg) endif() -set_target_properties(FortranRuntime PROPERTIES FOLDER "Flang/Runtime Libraries") +set_target_properties(flang_rt.runtime PROPERTIES FOLDER "Flang/Runtime Libraries") -# If FortranRuntime is part of a Flang build (and not a separate build) then +# If flang_rt is part of a Flang build (and not a separate build) then # add dependency to make sure that Fortran runtime library is being built after # we have the Flang compiler available. This also includes the MODULE files # that compile when the 'flang' target is built. @@ -317,9 +320,21 @@ set_target_properties(FortranRuntime PROPERTIES FOLDER "Flang/Runtime Libraries" # TODO: This is a workaround and should be updated when runtime build procedure # is changed to a regular runtime build. See discussion in PR #95388. 
if (TARGET flang AND TARGET module_files) - add_dependencies(FortranRuntime flang module_files) + add_dependencies(flang_rt.runtime flang module_files) endif() if (FLANG_CUF_RUNTIME) add_subdirectory(CUDA) endif() + +# Compatibility targets. +add_custom_target(flang-rt) +add_dependencies(flang-rt flang_rt.runtime) +if (TARGET flang_rt.quadmath) + add_dependencies(flang-rt flang_rt.quadmath) +endif () +if (TARGET flang_rt.cuda_${CUDAToolkit_VERSION_MAJOR}) + add_dependencies(flang-rt flang_rt.cuda_${CUDAToolkit_VERSION_MAJOR}) +endif () +add_custom_target(FortranRuntime) +add_dependencies(FortranRuntime flang_rt.runtime) diff --git a/flang/runtime/CUDA/CMakeLists.txt b/flang/runtime/CUDA/CMakeLists.txt index bfbae58086c1f..1fd3bf22a83cf 100644 --- a/flang/runtime/CUDA/CMakeLists.txt +++ b/flang/runtime/CUDA/CMakeLists.txt @@ -8,10 +8,10 @@ include_directories(${CUDAToolkit_INCLUDE_DIRS}) -# libCufRuntime depends on a certain version of CUDA. To be able to have +# libflang_rt.cuda depends on a certain version of CUDA. To be able to have # multiple build of this library with different CUDA version, the version is # added to the library name. -set(CUFRT_LIBNAME CufRuntime_cuda_${CUDAToolkit_VERSION_MAJOR}) +set(CUFRT_LIBNAME flang_rt.cuda_${CUDAToolkit_VERSION_MAJOR}) add_flang_library(${CUFRT_LIBNAME} allocator.cpp @@ -33,6 +33,6 @@ endif() target_link_libraries(${CUFRT_LIBNAME} PRIVATE - FortranRuntime + flang_rt.runtime ${CUDA_RT_TARGET} ) diff --git a/flang/runtime/Float128Math/CMakeLists.txt b/flang/runtime/Float128Math/CMakeLists.txt index 703f85fcaf8da..3c382d16a21cd 100644 --- a/flang/runtime/Float128Math/CMakeLists.txt +++ b/flang/runtime/Float128Math/CMakeLists.txt @@ -12,7 +12,7 @@ # It is distributed as a static library only. # Fortran programs/libraries that end up linking any of the provided # will have a dependency on the third-party library that is being -# used for building this FortranFloat128Math library. 
+# used for building this flang_rt.quadmath library. include(CheckLibraryExists) @@ -93,20 +93,20 @@ if (FLANG_RUNTIME_F128_MATH_LIB) ) endif() - add_flang_library(FortranFloat128Math STATIC INSTALL_WITH_TOOLCHAIN + add_flang_library(flang_rt.quadmath STATIC INSTALL_WITH_TOOLCHAIN ${sources}) if (DEFINED MSVC) set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded) - add_flang_library(FortranFloat128Math.static STATIC INSTALL_WITH_TOOLCHAIN + add_flang_library(flang_rt.quadmath.static STATIC INSTALL_WITH_TOOLCHAIN ${sources} ) set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDebug) - add_flang_library(FortranFloat128Math.static_dbg STATIC INSTALL_WITH_TOOLCHAIN + add_flang_library(flang_rt.quadmath.static_dbg STATIC INSTALL_WITH_TOOLCHAIN ${sources} ) - add_dependencies(FortranFloat128Math FortranFloat128Math.static - FortranFloat128Math.static_dbg + add_dependencies(flang_rt.quadmath flang_rt.quadmath.static + flang_rt.quadmath.static_dbg ) endif() elseif (HAVE_LDBL_MANT_DIG_113) @@ -118,7 +118,7 @@ elseif (HAVE_LDBL_MANT_DIG_113) ) target_sources(FortranFloat128MathILib INTERFACE ${sources}) else() - message(FATAL_ERROR "FortranRuntime cannot build without libm") + message(FATAL_ERROR "flang_rt.quadmath cannot build without libm") endif() else() # We can use '__float128' version from libc, if it has them. diff --git a/flang/runtime/time-intrinsic.cpp b/flang/runtime/time-intrinsic.cpp index e6f6e81c7b50c..942604a92aaad 100644 --- a/flang/runtime/time-intrinsic.cpp +++ b/flang/runtime/time-intrinsic.cpp @@ -62,7 +62,7 @@ template double GetCpuTime(fallback_implementation) { #if defined __MINGW32__ // clock_gettime is implemented in the pthread library for MinGW. -// Using it here would mean that all programs that link libFortranRuntime are +// Using it here would mean that all programs that link libflang_rt are // required to also link to pthread. Instead, don't use the function. 
#undef CLOCKID_CPU_TIME #undef CLOCKID_ELAPSED_TIME diff --git a/flang/runtime/tools.h b/flang/runtime/tools.h index facbd23161057..75544098d47ab 100644 --- a/flang/runtime/tools.h +++ b/flang/runtime/tools.h @@ -348,7 +348,7 @@ inline RT_API_ATTRS RESULT ApplyFloatingPointKind( if constexpr (HasCppTypeFor) { // If FUNC implemenation relies on FP math functions, // then we should not be here. The compiler should have - // generated a call to an entry in FortranFloat128Math + // generated a call to an entry in flang_rt.quadmath // library. if constexpr (!NEEDSMATH) { return FUNC<16>{}(std::forward(x)...); diff --git a/flang/test/CMakeLists.txt b/flang/test/CMakeLists.txt index e398e0786147a..3fac8717e9bd9 100644 --- a/flang/test/CMakeLists.txt +++ b/flang/test/CMakeLists.txt @@ -75,7 +75,7 @@ set(FLANG_TEST_DEPENDS ) if (FLANG_INCLUDE_RUNTIME) - list(APPEND FLANG_TEST_DEPENDS FortranRuntime) + list(APPEND FLANG_TEST_DEPENDS flang_rt.runtime) endif () if (LLVM_ENABLE_PLUGINS AND NOT WIN32) @@ -124,3 +124,9 @@ if (DEFINED FLANG_TEST_TARGET_TRIPLE) "to use FLANG_TEST_TARGET_TRIPLE.") endif() endif() + +# Compatibility targets. +if (FLANG_INCLUDE_RUNTIME) + add_custom_target(check-flang-rt) + add_dependencies(check-flang-rt check-flang) +endif () diff --git a/flang/test/Driver/gcc-toolchain-install-dir.f90 b/flang/test/Driver/gcc-toolchain-install-dir.f90 index 5a073b0c51712..e195bdde6d2c9 100644 --- a/flang/test/Driver/gcc-toolchain-install-dir.f90 +++ b/flang/test/Driver/gcc-toolchain-install-dir.f90 @@ -1,5 +1,5 @@ !! Test that --gcc-toolchain and --gcc-install-dir options are working as expected. -!! It does not test cross-compiling (--sysroot), so crtbegin.o, libgcc/compiler-rt, libc, libFortranRuntime, etc. are not supposed to be affected. +!! It does not test cross-compiling (--sysroot), so crtbegin.o, libgcc/compiler-rt, libc, libflang_rt.runtime, etc. are not supposed to be affected. !! 
PREFIX is captured twice because the driver escapes backslashes (occuring in Windows paths) in the -### output, but not on the "Selected GCC installation:" line. ! RUN: %flang 2>&1 -### -v -o %t %s -no-integrated-as -fuse-ld=ld --target=i386-unknown-linux-gnu --gcc-install-dir=%S/Inputs/basic_cross_linux_tree/usr/lib/gcc/i386-unknown-linux-gnu/10.2.0 | FileCheck %s --check-prefix=CHECK-I386 diff --git a/flang/test/Driver/linker-flags.f90 b/flang/test/Driver/linker-flags.f90 index 16bd4c3ba8371..4e62a8c32d360 100644 --- a/flang/test/Driver/linker-flags.f90 +++ b/flang/test/Driver/linker-flags.f90 @@ -29,37 +29,37 @@ ! executable and may find the GNU linker from MinGW or Cygwin. ! UNIX-LABEL: "{{.*}}ld{{(\.exe)?}}" ! UNIX-SAME: "[[object_file]]" -! UNIX-F128NONE-NOT: FortranFloat128Math -! SOLARIS-F128NONE-NOT: FortranFloat128Math -! UNIX-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed" -! SOLARIS-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "-z" "ignore" "-lquadmath" "-z" "record" -! UNIX-SAME: "-lFortranRuntime" "-lm" +! UNIX-F128NONE-NOT: lang_rt.quadmath +! SOLARIS-F128NONE-NOT: flang_rt.quadmath +! UNIX-F128LIBQUADMATH-SAME: "-lflang_rt.quadmath" "--as-needed" "-lquadmath" "--no-as-needed" +! SOLARIS-F128LIBQUADMATH-SAME: "-lflang_rt.quadmath" "-z" "ignore" "-lquadmath" "-z" "record" +! UNIX-SAME: "-lflang_rt.runtime" "-lm" ! COMPILER-RT: "{{.*}}{{\\|/}}libclang_rt.builtins.a" ! BSD-LABEL: "{{.*}}ld{{(\.exe)?}}" ! BSD-SAME: "[[object_file]]" -! BSD-F128NONE-NOT: FortranFloat128Math -! BSD-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed" -! BSD-SAME: -lFortranRuntime +! BSD-F128NONE-NOT: flang_rt.quadmath +! BSD-F128LIBQUADMATH-SAME: "-lflang_rt.quadmath" "--as-needed" "-lquadmath" "--no-as-needed" +! BSD-SAME: -lflang_rt.runtime ! BSD-SAME: -lexecinfo ! DARWIN-LABEL: "{{.*}}ld{{(\.exe)?}}" ! DARWIN-SAME: "[[object_file]]" -! DARWIN-F128NONE-NOT: FortranFloat128Math -! 
DARWIN-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed" -! DARWIN-SAME: -lFortranRuntime +! DARWIN-F128NONE-NOT: libflang_rt.quadmath +! DARWIN-F128LIBQUADMATH-SAME: "-lflang_rt.quadmath" "--as-needed" "-lquadmath" "--no-as-needed" +! DARWIN-SAME: -lflang_rt.runtime ! HAIKU-LABEL: "{{.*}}ld{{(\.exe)?}}" ! HAIKU-SAME: "[[object_file]]" -! HAIKU-F128NONE-NOT: FortranFloat128Math -! HAIKU-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed" -! HAIKU-SAME: "-lFortranRuntime" +! HAIKU-F128NONE-NOT: libflang_rt.quadmath +! HAIKU-F128LIBQUADMATH-SAME: "-lflang_rt.quadmath" "--as-needed" "-lquadmath" "--no-as-needed" +! HAIKU-SAME: "-lflang_rt.runtime" ! MINGW-LABEL: "{{.*}}ld{{(\.exe)?}}" ! MINGW-SAME: "[[object_file]]" -! MINGW-F128NONE-NOT: FortranFloat128Math -! MINGW-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed" -! MINGW-SAME: -lFortranRuntime +! MINGW-F128NONE-NOT: libflang_rt.quadmath +! MINGW-F128LIBQUADMATH-SAME: "-lflang_rt.quadmath" "--as-needed" "-lquadmath" "--no-as-needed" +! MINGW-SAME: -lflang_rt.runtime ! NOTE: This also matches lld-link (when CLANG_DEFAULT_LINKER=lld) and ! any .exe suffix that is added when resolving to the full path of diff --git a/flang/test/Driver/msvc-dependent-lib-flags.f90 b/flang/test/Driver/msvc-dependent-lib-flags.f90 index befe61fdadcd1..641c73912c4d1 100644 --- a/flang/test/Driver/msvc-dependent-lib-flags.f90 +++ b/flang/test/Driver/msvc-dependent-lib-flags.f90 @@ -7,21 +7,21 @@ ! MSVC-SAME: --dependent-lib=clang_rt.builtins.lib ! MSVC-SAME: -D_MT ! MSVC-SAME: --dependent-lib=libcmt -! MSVC-SAME: --dependent-lib=FortranRuntime.static.lib +! MSVC-SAME: --dependent-lib=flang_rt.runtime.static.lib ! MSVC-DEBUG: -fc1 ! MSVC-DEBUG-SAME: --dependent-lib=clang_rt.builtins.lib ! MSVC-DEBUG-SAME: -D_MT ! MSVC-DEBUG-SAME: -D_DEBUG ! MSVC-DEBUG-SAME: --dependent-lib=libcmtd -! 
MSVC-DEBUG-SAME: --dependent-lib=FortranRuntime.static_dbg.lib +! MSVC-DEBUG-SAME: --dependent-lib=flang_rt.runtime.static_dbg.lib ! MSVC-DLL: -fc1 ! MSVC-DLL-SAME: --dependent-lib=clang_rt.builtins.lib ! MSVC-DLL-SAME: -D_MT ! MSVC-DLL-SAME: -D_DLL ! MSVC-DLL-SAME: --dependent-lib=msvcrt -! MSVC-DLL-SAME: --dependent-lib=FortranRuntime.dynamic.lib +! MSVC-DLL-SAME: --dependent-lib=flang_rt.runtime.dynamic.lib ! MSVC-DLL-DEBUG: -fc1 ! MSVC-DLL-DEBUG-SAME: --dependent-lib=clang_rt.builtins.lib @@ -29,4 +29,4 @@ ! MSVC-DLL-DEBUG-SAME: -D_DEBUG ! MSVC-DLL-DEBUG-SAME: -D_DLL ! MSVC-DLL-DEBUG-SAME: --dependent-lib=msvcrtd -! MSVC-DLL-DEBUG-SAME: --dependent-lib=FortranRuntime.dynamic_dbg.lib +! MSVC-DLL-DEBUG-SAME: --dependent-lib=flang_rt.runtime.dynamic_dbg.lib diff --git a/flang/test/Driver/nostdlib.f90 b/flang/test/Driver/nostdlib.f90 index ab7c675fe7b77..dc23be6462376 100644 --- a/flang/test/Driver/nostdlib.f90 +++ b/flang/test/Driver/nostdlib.f90 @@ -24,5 +24,5 @@ ! in certain cases. But it is not clear that it is worth checking for each ! platform individually. -! CHECK-NOT: "-lFortranRuntime" +! CHECK-NOT: "-lflang_rt.runtime" ! CHECK-NOT: "-lgcc" diff --git a/flang/test/Runtime/no-cpp-dep.c b/flang/test/Runtime/no-cpp-dep.c index 7303ce63fdec4..4fcf8f9d478d8 100644 --- a/flang/test/Runtime/no-cpp-dep.c +++ b/flang/test/Runtime/no-cpp-dep.c @@ -1,6 +1,6 @@ /* This test makes sure that flang's runtime does not depend on the C++ runtime -library. It tries to link this simple file against libFortranRuntime.a with +library. It tries to link this simple file against libflang_rt.runtime.a with a C compiler. REQUIRES: c-compiler, flang-rt diff --git a/flang/test/lit.cfg.py b/flang/test/lit.cfg.py index f4580afc8c47b..c6266f3976f7c 100644 --- a/flang/test/lit.cfg.py +++ b/flang/test/lit.cfg.py @@ -170,7 +170,7 @@ # the C++ runtime libraries. For this we need a C compiler. If for some reason # we don't have one, we can just disable the test. 
if config.flang_include_runtime and config.cc: - libruntime = os.path.join(config.flang_lib_dir, "libFortranRuntime.a") + libruntime = os.path.join(config.flang_lib_dir, "libflang_rt.runtime.a") include = os.path.join(config.flang_src_dir, "include") if ( diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt index 85ba2c74cdeb5..5b5f23b5dc73c 100644 --- a/flang/tools/f18/CMakeLists.txt +++ b/flang/tools/f18/CMakeLists.txt @@ -5,7 +5,7 @@ set(LLVM_LINK_COMPONENTS ) # Define the list of Fortran module files that need to be compiled -# to produce an object file for inclusion into the FortranRuntime +# to produce an object file for inclusion into the flang_rt.runtime # library. set(MODULES_WITH_IMPLEMENTATION "iso_fortran_env_impl" @@ -105,7 +105,7 @@ if (NOT CMAKE_CROSSCOMPILING) endif() # Some modules have an implementation part that needs to be added to the - # FortranRuntime library. + # flang_rt.runtime library. set(compile_with "-fsyntax-only") set(object_output "") set(include_in_link FALSE) @@ -127,14 +127,14 @@ if (NOT CMAKE_CROSSCOMPILING) install(FILES ${base}.mod DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/flang") # If a module has been compiled into an object file, add the file to - # the link line for the FortranRuntime library. + # the link line for the flang_rt.runtime library. if(include_in_link) list(APPEND module_objects ${object_output}) endif() endforeach() # Set a CACHE variable that is visible to the CMakeLists.txt in runtime/, so that - # the compiled Fortran modules can be added to the link line of the FortranRuntime + # the compiled Fortran modules can be added to the link line of the flang_rt.runtime # library. 
set(FORTRAN_MODULE_OBJECTS ${module_objects} CACHE INTERNAL "" FORCE) diff --git a/flang/unittests/CMakeLists.txt b/flang/unittests/CMakeLists.txt index ecb7d68d8f729..c54ceb3332abf 100644 --- a/flang/unittests/CMakeLists.txt +++ b/flang/unittests/CMakeLists.txt @@ -2,7 +2,7 @@ include(AddFlangOffloadRuntime) if (FLANG_EXPERIMENTAL_CUDA_RUNTIME) # If Fortran runtime is built as CUDA library, the linking - # of targets that link FortranRuntime must be done + # of targets that link flang_rt.runtime must be done # with CUDA_RESOLVE_DEVICE_SYMBOLS. # CUDA language must be enabled for CUDA_RESOLVE_DEVICE_SYMBOLS # to take effect. diff --git a/flang/unittests/Evaluate/CMakeLists.txt b/flang/unittests/Evaluate/CMakeLists.txt index 1c3fac29cd298..2278d61febcb1 100644 --- a/flang/unittests/Evaluate/CMakeLists.txt +++ b/flang/unittests/Evaluate/CMakeLists.txt @@ -60,14 +60,14 @@ if (FLANG_INCLUDE_RUNTIME) NonGTestTesting FortranSemantics FortranEvaluate - FortranRuntime + flang_rt.runtime ) add_flang_nongtest_unittest(ISO-Fortran-binding NonGTestTesting FortranEvaluate FortranSemantics - FortranRuntime + flang_rt.runtime ) endif () diff --git a/flang/unittests/Runtime/CMakeLists.txt b/flang/unittests/Runtime/CMakeLists.txt index 179e439917ff2..f3743be49b015 100644 --- a/flang/unittests/Runtime/CMakeLists.txt +++ b/flang/unittests/Runtime/CMakeLists.txt @@ -33,7 +33,7 @@ add_flang_unittest(FlangRuntimeTests target_link_libraries(FlangRuntimeTests PRIVATE - FortranRuntime + flang_rt.runtime ) target_compile_definitions(FlangRuntimeTests PRIVATE NOT_EXE="$") diff --git a/flang/unittests/Runtime/CUDA/CMakeLists.txt b/flang/unittests/Runtime/CUDA/CMakeLists.txt index a7fe604d687bd..860b2664d623b 100644 --- a/flang/unittests/Runtime/CUDA/CMakeLists.txt +++ b/flang/unittests/Runtime/CUDA/CMakeLists.txt @@ -15,8 +15,8 @@ endif() target_link_libraries(FlangCufRuntimeTests PRIVATE ${CUDA_RT_TARGET} - CufRuntime_cuda_${CUDAToolkit_VERSION_MAJOR} - FortranRuntime + 
flang_rt.cuda_${CUDAToolkit_VERSION_MAJOR} + flang_rt.runtime ) target_include_directories(FlangCufRuntimeTests PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) diff --git a/lld/COFF/MinGW.cpp b/lld/COFF/MinGW.cpp index 0786353b06432..62db04cbe507e 100644 --- a/lld/COFF/MinGW.cpp +++ b/lld/COFF/MinGW.cpp @@ -49,7 +49,7 @@ AutoExporter::AutoExporter( "libclang_rt.profile-x86_64", "libc++", "libc++abi", - "libFortranRuntime", + "libflang_rt.runtime", "libunwind", "libmsvcrt", "libucrtbase", From 7f2f905361558b9137855b00debfdcc5eb057729 Mon Sep 17 00:00:00 2001 From: Vasileios Porpodas Date: Sat, 8 Feb 2025 09:05:33 -0800 Subject: [PATCH 066/293] [SandboxVec] Fix: Add missing lit.local.cfg for target test --- llvm/test/Transforms/SandboxVectorizer/X86/lit.local.cfg | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 llvm/test/Transforms/SandboxVectorizer/X86/lit.local.cfg diff --git a/llvm/test/Transforms/SandboxVectorizer/X86/lit.local.cfg b/llvm/test/Transforms/SandboxVectorizer/X86/lit.local.cfg new file mode 100644 index 0000000000000..42bf50dcc13c3 --- /dev/null +++ b/llvm/test/Transforms/SandboxVectorizer/X86/lit.local.cfg @@ -0,0 +1,2 @@ +if not "X86" in config.root.targets: + config.unsupported = True From 9266b48c5b28d4633cf7671c10c2aa52e22d4d65 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 8 Feb 2025 18:09:44 +0000 Subject: [PATCH 067/293] [VPlan] Add outer loop tests with wide phis in inner loop. Add test coverage with phis outside a header block with multiple incoming values. 
--- .../LoopVectorize/outer-loop-wide-phis.ll | 224 ++++++++++++++++++ 1 file changed, 224 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll diff --git a/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll b/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll new file mode 100644 index 0000000000000..3f81c0f5c822a --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll @@ -0,0 +1,224 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 +; RUN: opt -passes=loop-vectorize -enable-vplan-native-path -S %s | FileCheck %s + +define void @wide_phi_2_predecessors(ptr noalias %A, ptr noalias %B, i32 %c, i1 %cond) { +; CHECK-LABEL: define void @wide_phi_2_predecessors( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[C:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_LATCH:.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_LATCH]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: br label %[[INNER_HEADER1:.*]] +; CHECK: [[INNER_HEADER1]]: +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP4:%.*]], %[[INNER_LATCH4:.*]] ] +; 
CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP3:%.*]], %[[INNER_LATCH4]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[B]], <4 x i64> [[VEC_PHI]] +; CHECK-NEXT: br i1 [[COND]], label %[[THEN3:.*]], label %[[INNER_LATCH4]] +; CHECK: [[THEN3]]: +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP1]], i32 8, <4 x i1> splat (i1 true), <4 x i64> poison) +; CHECK-NEXT: br label %[[INNER_LATCH4]] +; CHECK: [[INNER_LATCH4]]: +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i64> [ [[WIDE_MASKED_GATHER]], %[[THEN3]] ], [ zeroinitializer, %[[INNER_HEADER1]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i64> [[VEC_PHI5]], [[VEC_IND]] +; CHECK-NEXT: [[TMP3]] = add nsw <4 x i64> [[TMP2]], [[VEC_PHI2]] +; CHECK-NEXT: [[TMP4]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1) +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[TMP4]], splat (i64 1000) +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 +; CHECK-NEXT: br i1 [[TMP6]], label %[[VECTOR_LATCH]], label %[[INNER_HEADER1]] +; CHECK: [[VECTOR_LATCH]]: +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i64> [ [[TMP3]], %[[INNER_LATCH4]] ] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[VEC_PHI6]], <4 x ptr> [[TMP0]], i32 8, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[OUTER_HEADER:.*]] +; CHECK: [[OUTER_HEADER]]: +; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ 
[[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OUTER_IV]] +; CHECK-NEXT: store i32 [[C]], ptr [[GEP_A]], align 4 +; CHECK-NEXT: br label %[[INNER_HEADER:.*]] +; CHECK: [[INNER_HEADER]]: +; CHECK-NEXT: [[INNER_IV:%.*]] = phi i64 [ 0, %[[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], %[[INNER_LATCH:.*]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ 0, %[[OUTER_HEADER]] ], [ [[RED_NEXT:%.*]], %[[INNER_LATCH]] ] +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INNER_IV]] +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[INNER_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[L_B:%.*]] = load i64, ptr [[GEP_B]], align 8 +; CHECK-NEXT: br label %[[INNER_LATCH]] +; CHECK: [[INNER_LATCH]]: +; CHECK-NEXT: [[P:%.*]] = phi i64 [ [[L_B]], %[[THEN]] ], [ 0, %[[INNER_HEADER]] ] +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i64 [[P]], [[OUTER_IV]] +; CHECK-NEXT: [[RED_NEXT]] = add nsw i64 [[ADD_1]], [[RED]] +; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i64 [[INNER_IV]], 1 +; CHECK-NEXT: [[INNER_EC:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[INNER_EC]], label %[[OUTER_LATCH]], label %[[INNER_HEADER]] +; CHECK: [[OUTER_LATCH]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[INNER_LATCH]] ] +; CHECK-NEXT: store i64 [[RED_NEXT_LCSSA]], ptr [[GEP_A]], align 8 +; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1 +; CHECK-NEXT: [[OUTER_EC:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[OUTER_EC]], label %[[EXIT]], label %[[OUTER_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %outer.header + +outer.header: ; preds = %outer.latch, %outer.header.lr.ph + %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %outer.latch ] + %gep.A = getelementptr inbounds i64, ptr %A, i64 %outer.iv + store i32 %c, ptr %gep.A, align 4 + br label 
%inner.header + +inner.header: + %inner.iv = phi i64 [ 0, %outer.header ], [ %inner.iv.next, %inner.latch ] + %red = phi i64 [ 0, %outer.header ], [ %red.next, %inner.latch ] + %gep.B = getelementptr inbounds i64, ptr %B, i64 %inner.iv + br i1 %cond, label %then, label %inner.latch + +then: + %l.b = load i64, ptr %gep.B, align 8 + br label %inner.latch + +inner.latch: + %p = phi i64 [ %l.b, %then ], [ 0, %inner.header ] + %add.1 = add nsw i64 %p, %outer.iv + %red.next = add nsw i64 %add.1, %red + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %inner.ec = icmp eq i64 %inner.iv.next, 1000 + br i1 %inner.ec, label %outer.latch, label %inner.header + +outer.latch: + store i64 %red.next, ptr %gep.A, align 8 + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %outer.ec = icmp eq i64 %outer.iv.next, 1000 + br i1 %outer.ec, label %exit, label %outer.header, !llvm.loop !1 + +exit: + ret void +} + +define void @wide_phi_2_predecessors_phi_ops_swapped(ptr noalias %A, ptr noalias %B, i32 %c, i1 %cond) { +; CHECK-LABEL: define void @wide_phi_2_predecessors_phi_ops_swapped( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i32 [[C:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_LATCH:.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_LATCH]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> 
[[BROADCAST_SPLAT]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: br label %[[INNER_HEADER1:.*]] +; CHECK: [[INNER_HEADER1]]: +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP4:%.*]], %[[INNER_LATCH4:.*]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP3:%.*]], %[[INNER_LATCH4]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[B]], <4 x i64> [[VEC_PHI]] +; CHECK-NEXT: br i1 [[COND]], label %[[THEN3:.*]], label %[[INNER_LATCH4]] +; CHECK: [[THEN3]]: +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP1]], i32 8, <4 x i1> splat (i1 true), <4 x i64> poison) +; CHECK-NEXT: br label %[[INNER_LATCH4]] +; CHECK: [[INNER_LATCH4]]: +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i64> [ zeroinitializer, %[[INNER_HEADER1]] ], [ [[WIDE_MASKED_GATHER]], %[[THEN3]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i64> [[VEC_PHI5]], [[VEC_IND]] +; CHECK-NEXT: [[TMP3]] = add nsw <4 x i64> [[TMP2]], [[VEC_PHI2]] +; CHECK-NEXT: [[TMP4]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1) +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[TMP4]], splat (i64 1000) +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 +; CHECK-NEXT: br i1 [[TMP6]], label %[[VECTOR_LATCH]], label %[[INNER_HEADER1]] +; CHECK: [[VECTOR_LATCH]]: +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i64> [ [[TMP3]], %[[INNER_LATCH4]] ] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[VEC_PHI6]], <4 x ptr> [[TMP0]], i32 8, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label 
%[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[OUTER_HEADER:.*]] +; CHECK: [[OUTER_HEADER]]: +; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[OUTER_IV_NEXT:%.*]], %[[OUTER_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OUTER_IV]] +; CHECK-NEXT: store i32 [[C]], ptr [[GEP_A]], align 4 +; CHECK-NEXT: br label %[[INNER_HEADER:.*]] +; CHECK: [[INNER_HEADER]]: +; CHECK-NEXT: [[INNER_IV:%.*]] = phi i64 [ 0, %[[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], %[[INNER_LATCH:.*]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ 0, %[[OUTER_HEADER]] ], [ [[RED_NEXT:%.*]], %[[INNER_LATCH]] ] +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INNER_IV]] +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[INNER_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[L_B:%.*]] = load i64, ptr [[GEP_B]], align 8 +; CHECK-NEXT: br label %[[INNER_LATCH]] +; CHECK: [[INNER_LATCH]]: +; CHECK-NEXT: [[P:%.*]] = phi i64 [ 0, %[[INNER_HEADER]] ], [ [[L_B]], %[[THEN]] ] +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i64 [[P]], [[OUTER_IV]] +; CHECK-NEXT: [[RED_NEXT]] = add nsw i64 [[ADD_1]], [[RED]] +; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i64 [[INNER_IV]], 1 +; CHECK-NEXT: [[INNER_EC:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[INNER_EC]], label %[[OUTER_LATCH]], label %[[INNER_HEADER]] +; CHECK: [[OUTER_LATCH]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[INNER_LATCH]] ] +; CHECK-NEXT: store i64 [[RED_NEXT_LCSSA]], ptr [[GEP_A]], align 8 +; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1 +; CHECK-NEXT: [[OUTER_EC:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[OUTER_EC]], label %[[EXIT]], label %[[OUTER_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; 
+entry: + br label %outer.header + +outer.header: ; preds = %outer.latch, %outer.header.lr.ph + %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %outer.latch ] + %gep.A = getelementptr inbounds i64, ptr %A, i64 %outer.iv + store i32 %c, ptr %gep.A, align 4 + br label %inner.header + +inner.header: + %inner.iv = phi i64 [ 0, %outer.header ], [ %inner.iv.next, %inner.latch ] + %red = phi i64 [ 0, %outer.header ], [ %red.next, %inner.latch ] + %gep.B = getelementptr inbounds i64, ptr %B, i64 %inner.iv + br i1 %cond, label %then, label %inner.latch + +then: + %l.b = load i64, ptr %gep.B, align 8 + br label %inner.latch + +inner.latch: + %p = phi i64 [ 0, %inner.header ], [ %l.b, %then ] + %add.1 = add nsw i64 %p, %outer.iv + %red.next = add nsw i64 %add.1, %red + %inner.iv.next = add nuw nsw i64 %inner.iv, 1 + %inner.ec = icmp eq i64 %inner.iv.next, 1000 + br i1 %inner.ec, label %outer.latch, label %inner.header + +outer.latch: + store i64 %red.next, ptr %gep.A, align 8 + %outer.iv.next = add nuw nsw i64 %outer.iv, 1 + %outer.ec = icmp eq i64 %outer.iv.next, 1000 + br i1 %outer.ec, label %exit, label %outer.header, !llvm.loop !1 + +exit: + ret void +} + +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 4} +!3 = !{!"llvm.loop.vectorize.enable", i1 true} From ca9c0486cccba08dc6a3489176cbd7f38bad8e63 Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 8 Feb 2025 19:32:14 +0000 Subject: [PATCH 068/293] [ARM] Silence "enumerated and non-enumerated type in conditional expression" warning. 
NFC Fixes #125543 --- llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 6e2886a192923..dfaad24e1b877 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -1395,7 +1395,7 @@ static MCAsmBackend *createARMAsmBackend(const Target &T, case Triple::ELF: assert(TheTriple.isOSBinFormatELF() && "using ELF for non-ELF target"); uint8_t OSABI = Options.FDPIC - ? ELF::ELFOSABI_ARM_FDPIC + ? static_cast<uint8_t>(ELF::ELFOSABI_ARM_FDPIC) : MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); return new ARMAsmBackendELF(T, STI.getTargetTriple().isThumb(), OSABI, Endian); From 451007173abaeff7de70d6d7fb0135b7858d093d Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 8 Feb 2025 11:35:17 -0800 Subject: [PATCH 069/293] [Analysis] Avoid repeated hash lookups (NFC) (#126378) --- clang/lib/Analysis/UninitializedValues.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/clang/lib/Analysis/UninitializedValues.cpp b/clang/lib/Analysis/UninitializedValues.cpp index bf2f730618650..3a052eb27a444 100644 --- a/clang/lib/Analysis/UninitializedValues.cpp +++ b/clang/lib/Analysis/UninitializedValues.cpp @@ -379,8 +379,10 @@ void ClassifyRefs::classify(const Expr *E, Class C) { } FindVarResult Var = findVar(E, DC); - if (const DeclRefExpr *DRE = Var.getDeclRefExpr()) - Classification[DRE] = std::max(Classification[DRE], C); + if (const DeclRefExpr *DRE = Var.getDeclRefExpr()) { + auto &Class = Classification[DRE]; + Class = std::max(Class, C); + } } void ClassifyRefs::VisitDeclStmt(DeclStmt *DS) { From 2fee5ef2356b514dda30e89f39125a390c0d928e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 8 Feb 2025 11:35:45 -0800 Subject: [PATCH 070/293] [ByteCode] Avoid repeated hash lookups (NFC) (#126379) --- 
clang/lib/AST/ByteCode/Program.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/clang/lib/AST/ByteCode/Program.cpp b/clang/lib/AST/ByteCode/Program.cpp index e0b86d46428a2..833c9ef88d770 100644 --- a/clang/lib/AST/ByteCode/Program.cpp +++ b/clang/lib/AST/ByteCode/Program.cpp @@ -18,14 +18,12 @@ using namespace clang; using namespace clang::interp; unsigned Program::getOrCreateNativePointer(const void *Ptr) { - auto It = NativePointerIndices.find(Ptr); - if (It != NativePointerIndices.end()) - return It->second; + auto [It, Inserted] = + NativePointerIndices.try_emplace(Ptr, NativePointers.size()); + if (Inserted) + NativePointers.push_back(Ptr); - unsigned Idx = NativePointers.size(); - NativePointers.push_back(Ptr); - NativePointerIndices[Ptr] = Idx; - return Idx; + return It->second; } const void *Program::getNativePointer(unsigned Idx) { From 7628fcf3d43eb20c292ab0dd25ba3f52dba248a6 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 8 Feb 2025 11:36:05 -0800 Subject: [PATCH 071/293] [CrossTU] Avoid repeated hash lookups (NFC) (#126380) --- clang/lib/CrossTU/CrossTranslationUnit.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/clang/lib/CrossTU/CrossTranslationUnit.cpp b/clang/lib/CrossTU/CrossTranslationUnit.cpp index 9faf2a8a17341..ad2ebb6cd6e6c 100644 --- a/clang/lib/CrossTU/CrossTranslationUnit.cpp +++ b/clang/lib/CrossTU/CrossTranslationUnit.cpp @@ -453,7 +453,8 @@ CrossTranslationUnitContext::ASTUnitStorage::getASTUnitForFunction( return std::move(IndexLoadError); // Check if there is an entry in the index for the function. 
- if (!NameFileMap.count(FunctionName)) { + auto It = NameFileMap.find(FunctionName); + if (It == NameFileMap.end()) { ++NumNotInOtherTU; return llvm::make_error<IndexError>(index_error_code::missing_definition); } @@ -461,7 +462,7 @@ CrossTranslationUnitContext::ASTUnitStorage::getASTUnitForFunction( // Search in the index for the filename where the definition of FunctionName // resides. if (llvm::Expected<ASTUnit *> FoundForFile = - getASTUnitForFile(NameFileMap[FunctionName], DisplayCTUProgress)) { + getASTUnitForFile(It->second, DisplayCTUProgress)) { // Update the cache. NameASTUnitMap[FunctionName] = *FoundForFile; From cf5947be13e7af67219379e07bc0128f1f1e7f88 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 8 Feb 2025 11:36:35 -0800 Subject: [PATCH 072/293] [TableGen] Avoid repeated map lookups (NFC) (#126381) --- clang/utils/TableGen/ClangOptionDocEmitter.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/clang/utils/TableGen/ClangOptionDocEmitter.cpp b/clang/utils/TableGen/ClangOptionDocEmitter.cpp index e08fb11df3100..b6c1aad90b5cb 100644 --- a/clang/utils/TableGen/ClangOptionDocEmitter.cpp +++ b/clang/utils/TableGen/ClangOptionDocEmitter.cpp @@ -109,13 +109,17 @@ Documentation extractDocumentation(const RecordKeeper &Records, // Pretend no-X and Xno-Y options are aliases of X and XY. 
std::string Name = std::string(R->getValueAsString("Name")); if (Name.size() >= 4) { - if (Name.substr(0, 3) == "no-" && OptionsByName[Name.substr(3)]) { - Aliases[OptionsByName[Name.substr(3)]].push_back(R); - continue; + if (Name.substr(0, 3) == "no-") { + if (const Record *Opt = OptionsByName[Name.substr(3)]) { + Aliases[Opt].push_back(R); + continue; + } } - if (Name.substr(1, 3) == "no-" && OptionsByName[Name[0] + Name.substr(4)]) { - Aliases[OptionsByName[Name[0] + Name.substr(4)]].push_back(R); - continue; + if (Name.substr(1, 3) == "no-") { + if (const Record *Opt = OptionsByName[Name[0] + Name.substr(4)]) { + Aliases[Opt].push_back(R); + continue; + } } } From 1e0a48967102780a3caad09e874539869d04110d Mon Sep 17 00:00:00 2001 From: YongKang Zhu Date: Sat, 8 Feb 2025 14:02:46 -0800 Subject: [PATCH 073/293] [BOLT] Resolve symlink for library lookup (#126386) --- bolt/lib/RuntimeLibs/RuntimeLibrary.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/bolt/lib/RuntimeLibs/RuntimeLibrary.cpp b/bolt/lib/RuntimeLibs/RuntimeLibrary.cpp index 336c6768a7f71..8f5719e84ecea 100644 --- a/bolt/lib/RuntimeLibs/RuntimeLibrary.cpp +++ b/bolt/lib/RuntimeLibs/RuntimeLibrary.cpp @@ -18,6 +18,7 @@ #include "llvm/Object/Archive.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" #define DEBUG_TYPE "bolt-rtlib" @@ -38,6 +39,23 @@ std::string RuntimeLibrary::getLibPathByToolPath(StringRef ToolPath, llvm::sys::path::append(LibPath, "lib" LLVM_LIBDIR_SUFFIX); } llvm::sys::path::append(LibPath, LibFileName); + if (!llvm::sys::fs::exists(LibPath)) { + // If it is a symlink, check the directory that the symlink points to. 
+ if (llvm::sys::fs::is_symlink_file(ToolPath)) { + SmallString<256> RealPath; + llvm::sys::fs::real_path(ToolPath, RealPath); + if (llvm::ErrorOr<std::string> P = + llvm::sys::findProgramByName(RealPath)) { + outs() << "BOLT-INFO: library not found: " << LibPath << "\n" + << "BOLT-INFO: " << ToolPath << " is a symlink; will look up " + << LibFileName + << " at the target directory that the symlink points to\n"; + return getLibPath(*P, LibFileName); + } + } + errs() << "BOLT-ERROR: library not found: " << LibPath << "\n"; + exit(1); + } return std::string(LibPath); } From 8e61aae4a8ce938f42604b10123c3b21d4adc0b8 Mon Sep 17 00:00:00 2001 From: Wael Yehia Date: Sat, 8 Feb 2025 17:25:07 -0500 Subject: [PATCH 074/293] [profile] Add a clang option -fprofile-continuous that enables continuous instrumentation profiling mode (#124353) In Continuous instrumentation profiling mode, profile or coverage data collected via compiler instrumentation is continuously synced to the profile file. This feature has existed for a while, and is documented here: https://clang.llvm.org/docs/SourceBasedCodeCoverage.html#running-the-instrumented-program This PR creates a user facing option to enable the feature. 
--------- Co-authored-by: Wael Yehia --- clang/docs/UsersManual.rst | 18 ++++++++ clang/include/clang/Basic/CodeGenOptions.def | 1 + clang/include/clang/Driver/Options.td | 5 ++ clang/lib/CodeGen/BackendUtil.cpp | 48 +++++++++++--------- clang/lib/Driver/ToolChains/Clang.cpp | 29 ++++++++++++ clang/test/CodeGen/profile-continuous.c | 11 +++++ clang/test/Driver/fprofile-continuous.c | 21 +++++++++ 7 files changed, 112 insertions(+), 21 deletions(-) create mode 100644 clang/test/CodeGen/profile-continuous.c create mode 100644 clang/test/Driver/fprofile-continuous.c diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 0f2f313ad184a..d977868b8a2c6 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -3125,6 +3125,24 @@ indexed format, regardeless whether it is produced by frontend or the IR pass. overhead. ``prefer-atomic`` will be transformed to ``atomic`` when supported by the target, or ``single`` otherwise. +.. option:: -fprofile-continuous + + Enables the continuous instrumentation profiling where profile counter updates + are continuously synced to a file. This option sets any necessary modifiers + (currently ``%c``) in the default profile filename and passes any necessary + flags to the middle-end to support this mode. Value profiling is not supported + in continuous mode. + + .. code-block:: console + + $ clang++ -O2 -fprofile-generate -fprofile-continuous code.cc -o code + + Running ``./code`` will collect the profile and write it to the + ``default_xxxx.profraw`` file. However, if ``./code`` abruptly terminates or + does not call ``exit()``, in continuous mode the profile collected up to the + point of termination will be available in ``default_xxxx.profraw`` while in + the non-continuous mode, no profile file is generated. + .. 
option:: -ftemporal-profile Enables the temporal profiling extension for IRPGO to improve startup time by diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 68831093c6ad8..a7f5f1abbb825 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -221,6 +221,7 @@ AFFECTING_VALUE_CODEGENOPT(OptimizationLevel, 2, 0) ///< The -O[0-3] option spec AFFECTING_VALUE_CODEGENOPT(OptimizeSize, 2, 0) ///< If -Os (==1) or -Oz (==2) is specified. CODEGENOPT(AtomicProfileUpdate , 1, 0) ///< Set -fprofile-update=atomic +CODEGENOPT(ContinuousProfileSync, 1, 0) ///< Enable continuous instrumentation profiling /// Choose profile instrumenation kind or no instrumentation. ENUM_CODEGENOPT(ProfileInstr, ProfileInstrKind, 2, ProfileNone) /// Choose profile kind for PGO use compilation. diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index df226fd9e9aa2..c9d192a20ff1f 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1795,6 +1795,11 @@ def fprofile_update_EQ : Joined<["-"], "fprofile-update=">, Values<"atomic,prefer-atomic,single">, MetaVarName<"">, HelpText<"Set update method of profile counters">, MarshallingInfoFlag>; +def fprofile_continuous : Flag<["-"], "fprofile-continuous">, + Group, Visibility<[ClangOption, CC1Option]>, + HelpText<"Enable continuous instrumentation profiling mode">, + MarshallingInfoFlag>; + defm pseudo_probe_for_profiling : BoolFOption<"pseudo-probe-for-profiling", CodeGenOpts<"PseudoProbeForProfiling">, DefaultFalse, PosFlag, diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 57106e4287765..1750719e17670 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -124,15 +124,25 @@ namespace clang { extern llvm::cl::opt ClSanitizeGuardChecks; } -namespace { - // Default filename used for profile 
generation. -std::string getDefaultProfileGenName() { +static std::string getDefaultProfileGenName() { return DebugInfoCorrelate || ProfileCorrelate != InstrProfCorrelator::NONE ? "default_%m.proflite" : "default_%m.profraw"; } +// Path and name of file used for profile generation +static std::string getProfileGenName(const CodeGenOptions &CodeGenOpts) { + std::string FileName = CodeGenOpts.InstrProfileOutput.empty() + ? getDefaultProfileGenName() + : CodeGenOpts.InstrProfileOutput; + if (CodeGenOpts.ContinuousProfileSync) + FileName = "%c" + FileName; + return FileName; +} + +namespace { + class EmitAssemblyHelper { CompilerInstance &CI; DiagnosticsEngine &Diags; @@ -551,7 +561,9 @@ getInstrProfOptions(const CodeGenOptions &CodeGenOpts, return std::nullopt; InstrProfOptions Options; Options.NoRedZone = CodeGenOpts.DisableRedZone; - Options.InstrProfileOutput = CodeGenOpts.InstrProfileOutput; + Options.InstrProfileOutput = CodeGenOpts.ContinuousProfileSync + ? ("%c" + CodeGenOpts.InstrProfileOutput) + : CodeGenOpts.InstrProfileOutput; Options.Atomic = CodeGenOpts.AtomicProfileUpdate; return Options; } @@ -822,13 +834,12 @@ void EmitAssemblyHelper::RunOptimizationPipeline( if (CodeGenOpts.hasProfileIRInstr()) // -fprofile-generate. - PGOOpt = PGOOptions( - CodeGenOpts.InstrProfileOutput.empty() ? getDefaultProfileGenName() - : CodeGenOpts.InstrProfileOutput, - "", "", CodeGenOpts.MemoryProfileUsePath, nullptr, PGOOptions::IRInstr, - PGOOptions::NoCSAction, ClPGOColdFuncAttr, - CodeGenOpts.DebugInfoForProfiling, - /*PseudoProbeForProfiling=*/false, CodeGenOpts.AtomicProfileUpdate); + PGOOpt = PGOOptions(getProfileGenName(CodeGenOpts), "", "", + CodeGenOpts.MemoryProfileUsePath, nullptr, + PGOOptions::IRInstr, PGOOptions::NoCSAction, + ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling, + /*PseudoProbeForProfiling=*/false, + CodeGenOpts.AtomicProfileUpdate); else if (CodeGenOpts.hasProfileIRUse()) { // -fprofile-use. auto CSAction = CodeGenOpts.hasProfileCSIRUse() ? 
PGOOptions::CSIRUse @@ -872,18 +883,13 @@ void EmitAssemblyHelper::RunOptimizationPipeline( PGOOpt->Action != PGOOptions::SampleUse && "Cannot run CSProfileGen pass with ProfileGen or SampleUse " " pass"); - PGOOpt->CSProfileGenFile = CodeGenOpts.InstrProfileOutput.empty() - ? getDefaultProfileGenName() - : CodeGenOpts.InstrProfileOutput; + PGOOpt->CSProfileGenFile = getProfileGenName(CodeGenOpts); PGOOpt->CSAction = PGOOptions::CSIRInstr; } else - PGOOpt = PGOOptions("", - CodeGenOpts.InstrProfileOutput.empty() - ? getDefaultProfileGenName() - : CodeGenOpts.InstrProfileOutput, - "", /*MemoryProfile=*/"", nullptr, - PGOOptions::NoAction, PGOOptions::CSIRInstr, - ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling); + PGOOpt = PGOOptions("", getProfileGenName(CodeGenOpts), "", + /*MemoryProfile=*/"", nullptr, PGOOptions::NoAction, + PGOOptions::CSIRInstr, ClPGOColdFuncAttr, + CodeGenOpts.DebugInfoForProfiling); } if (TM) TM->setPGOOption(PGOOpt); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 0a6756eadba31..821407687ffa1 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -580,6 +580,7 @@ static void addPGOAndCoverageFlags(const ToolChain &TC, Compilation &C, const ArgList &Args, SanitizerArgs &SanArgs, ArgStringList &CmdArgs) { const Driver &D = TC.getDriver(); + const llvm::Triple &T = TC.getTriple(); auto *PGOGenerateArg = Args.getLastArg(options::OPT_fprofile_generate, options::OPT_fprofile_generate_EQ, options::OPT_fno_profile_generate); @@ -785,6 +786,34 @@ static void addPGOAndCoverageFlags(const ToolChain &TC, Compilation &C, D.Diag(diag::err_drv_unsupported_option_argument) << A->getSpelling() << Val; } + if (const auto *A = Args.getLastArg(options::OPT_fprofile_continuous)) { + if (!PGOGenerateArg && !CSPGOGenerateArg && !ProfileGenerateArg) + D.Diag(clang::diag::err_drv_argument_only_allowed_with) + << A->getSpelling() + << "-fprofile-generate, 
-fprofile-instr-generate, or " + "-fcs-profile-generate"; + else { + CmdArgs.push_back("-fprofile-continuous"); + // Platforms that require a bias variable: + if (T.isOSBinFormatELF() || T.isOSAIX()) { + CmdArgs.push_back("-mllvm"); + CmdArgs.push_back("-runtime-counter-relocation"); + } + // -fprofile-instr-generate does not decide the profile file name in the + // FE, and so it does not define the filename symbol + // (__llvm_profile_filename). Instead, the runtime uses the name + // "default.profraw" for the profile file. When continuous mode is ON, we + // will create the filename symbol so that we can insert the "%c" + // modifier. + if (ProfileGenerateArg && + (ProfileGenerateArg->getOption().matches( + options::OPT_fprofile_instr_generate) || + (ProfileGenerateArg->getOption().matches( + options::OPT_fprofile_instr_generate_EQ) && + strlen(ProfileGenerateArg->getValue()) == 0))) + CmdArgs.push_back("-fprofile-instrument-path=default.profraw"); + } + } int FunctionGroups = 1; int SelectedFunctionGroup = 0; diff --git a/clang/test/CodeGen/profile-continuous.c b/clang/test/CodeGen/profile-continuous.c new file mode 100644 index 0000000000000..86fa1d149b971 --- /dev/null +++ b/clang/test/CodeGen/profile-continuous.c @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -emit-llvm -fprofile-instrument=llvm -fprofile-continuous %s -o - | FileCheck %s --check-prefix=IRPGO +// RUN: %clang_cc1 -emit-llvm -fprofile-instrument=llvm -fprofile-continuous -fprofile-instrument-path=mydir/default_%m.profraw -mllvm -runtime-counter-relocation %s -o - \ +// RUN: | FileCheck %s --check-prefix=IRPGO_EQ +// RUN: %clang_cc1 -emit-llvm -O2 -fprofile-instrument=csllvm -fprofile-continuous %s -o - | FileCheck %s --check-prefix=CSIRPGO +// RUN: %clang_cc1 -emit-llvm -fprofile-instrument=clang -fprofile-continuous -fprofile-instrument-path=default.profraw %s -o - | FileCheck %s --check-prefix=CLANG_PGO + +// IRPGO: @__llvm_profile_filename = {{.*}} c"%cdefault_%m.profraw\00" +// IRPGO_EQ: 
@__llvm_profile_filename = {{.*}} c"%cmydir/default_%m.profraw\00" +// CSIRPGO: @__llvm_profile_filename = {{.*}} c"%cdefault_%m.profraw\00" +// CLANG_PGO: @__llvm_profile_filename = {{.*}} c"%cdefault.profraw\00" +void foo(){} diff --git a/clang/test/Driver/fprofile-continuous.c b/clang/test/Driver/fprofile-continuous.c new file mode 100644 index 0000000000000..81719fb70cb1e --- /dev/null +++ b/clang/test/Driver/fprofile-continuous.c @@ -0,0 +1,21 @@ +// 1) test on platforms that (do or do not) require runtime relocation + +// RUN: %clang --target=x86_64-darwin -fprofile-generate -fprofile-continuous -### -c %s 2>&1 | FileCheck %s --check-prefix=NO_RELOC +// NO_RELOC: "-cc1" {{.*}} "-fprofile-continuous" +// NO_RELOC-NOT: "-mllvm" "-runtime-counter-relocation" + +// RUN: %clang --target=powerpc64-ibm-aix -fprofile-generate -fprofile-continuous -### -c %s 2>&1 | FileCheck %s --check-prefix=RELOC +// RUN: %clang --target=x86_64-unknown-fuchsia -fprofile-generate -fprofile-continuous -### -c %s 2>&1 | FileCheck %s --check-prefix=RELOC +// RELOC: "-cc1" {{.*}} "-fprofile-continuous" "-mllvm" "-runtime-counter-relocation" + +// 2) test -fprofile-continuous with cs-profile-generate and -fprofile-instr-generate + +// RUN: %clang --target=powerpc-ibm-aix -fprofile-instr-generate -fprofile-continuous -### -c %s 2>&1 | FileCheck %s --check-prefix=CLANG_PGO +// RUN: %clang --target=powerpc64le-unknown-linux -fprofile-instr-generate= -fprofile-continuous -### -c %s 2>&1 | FileCheck %s --check-prefix=CLANG_PGO +// CLANG_PGO: "-cc1" {{.*}} "-fprofile-continuous" "-mllvm" "-runtime-counter-relocation" "-fprofile-instrument-path=default.profraw" + +// RUN: %clang --target=x86_64-unknown-fuchsia -fcs-profile-generate -fprofile-continuous -### -c %s 2>&1 | FileCheck %s --check-prefix=RELOC + +// RUN: not %clang -fprofile-continuous -### -c %s 2>&1 | FileCheck %s --check-prefix=ERROR +// ERROR: error: invalid argument '-fprofile-continuous' only allowed with '-fprofile-generate, 
-fprofile-instr-generate, or -fcs-profile-generate' +void foo(){} From fec6d168bbdf5116d2f7aaa52f0f429916af4f2d Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Sat, 8 Feb 2025 15:50:52 -0800 Subject: [PATCH 075/293] [lldb] Upstream a few remaining Triple::XROS patches (#126335) Recognize the visionOS Triple::OSType::XROS os type. Some of these have already been landed on main, but I reviewed the downstream sources and there were a few that still needed to be landed upstream. --- lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp | 1 + .../DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp | 10 ++++++++-- .../Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp | 5 ++--- .../Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp | 3 +++ .../gdb-remote/GDBRemoteCommunicationServerCommon.cpp | 2 ++ lldb/tools/debugserver/source/RNBRemote.cpp | 4 ++++ 6 files changed, 20 insertions(+), 5 deletions(-) diff --git a/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp b/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp index 54028b1b3261a..83b01b14aedc5 100644 --- a/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp +++ b/lldb/source/Plugins/ABI/X86/ABISysV_x86_64.cpp @@ -79,6 +79,7 @@ ABISysV_x86_64::CreateInstance(lldb::ProcessSP process_sp, const ArchSpec &arch) case llvm::Triple::OSType::IOS: case llvm::Triple::OSType::TvOS: case llvm::Triple::OSType::WatchOS: + case llvm::Triple::OSType::XROS: switch (os_env) { case llvm::Triple::EnvironmentType::MacABI: case llvm::Triple::EnvironmentType::Simulator: diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp index d512d6143639c..14d05a1a4494c 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp @@ -419,6 +419,8 @@ bool DynamicLoaderDarwin::JSONImageInformationIntoImageInfo( image_infos[i].os_type = llvm::Triple::WatchOS; else if (os_name == 
"bridgeos") image_infos[i].os_type = llvm::Triple::BridgeOS; + else if (os_name == "xros") + image_infos[i].os_type = llvm::Triple::XROS; else if (os_name == "maccatalyst") { image_infos[i].os_type = llvm::Triple::IOS; image_infos[i].os_env = llvm::Triple::MacABI; @@ -431,6 +433,9 @@ bool DynamicLoaderDarwin::JSONImageInformationIntoImageInfo( } else if (os_name == "watchossimulator") { image_infos[i].os_type = llvm::Triple::WatchOS; image_infos[i].os_env = llvm::Triple::Simulator; + } else if (os_name == "xrsimulator") { + image_infos[i].os_type = llvm::Triple::XROS; + image_infos[i].os_env = llvm::Triple::Simulator; } } if (image->HasKey("min_version_os_sdk")) { @@ -765,7 +770,8 @@ bool DynamicLoaderDarwin::AddModulesUsingPreloadedModules( (dyld_triple.getEnvironment() == llvm::Triple::Simulator && (dyld_triple.getOS() == llvm::Triple::IOS || dyld_triple.getOS() == llvm::Triple::TvOS || - dyld_triple.getOS() == llvm::Triple::WatchOS))) + dyld_triple.getOS() == llvm::Triple::WatchOS || + dyld_triple.getOS() == llvm::Triple::XROS))) image_module_sp->MergeArchitecture(dyld_spec); } } @@ -835,7 +841,7 @@ lldb_private::ArchSpec DynamicLoaderDarwin::ImageInfo::GetArchitecture() const { } if (os_env == llvm::Triple::Simulator && (os_type == llvm::Triple::IOS || os_type == llvm::Triple::TvOS || - os_type == llvm::Triple::WatchOS)) { + os_type == llvm::Triple::WatchOS || os_type == llvm::Triple::XROS)) { llvm::Triple triple(llvm::Twine(arch_spec.GetArchitectureName()) + "-apple-" + llvm::Triple::getOSTypeName(os_type) + min_version_os_sdk + "-simulator"); diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index bf2d293d2012c..4b69fa6e2bfb2 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -2848,7 +2848,7 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { "DSC unmapped local symbol[{0}] has invalid " 
"string table offset {1:x} in {2}, ignoring symbol", nlist_index, nlist.n_strx, - module_sp->GetFileSpec().GetPath()); + module_sp->GetFileSpec().GetPath())); continue; } if (symbol_name[0] == '\0') @@ -6557,9 +6557,8 @@ bool ObjectFileMachO::SaveCore(const lldb::ProcessSP &process_sp, target_triple.getOS() == llvm::Triple::IOS || target_triple.getOS() == llvm::Triple::WatchOS || target_triple.getOS() == llvm::Triple::TvOS || + target_triple.getOS() == llvm::Triple::BridgeOS || target_triple.getOS() == llvm::Triple::XROS)) { - // NEED_BRIDGEOS_TRIPLE target_triple.getOS() == llvm::Triple::BridgeOS)) - // { bool make_core = false; switch (target_arch.GetMachine()) { case llvm::Triple::aarch64: diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp index 6f75e5ea70b6a..605e3d5704969 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp @@ -126,6 +126,7 @@ PlatformSP PlatformDarwinKernel::CreateInstance(bool force, case llvm::Triple::MacOSX: case llvm::Triple::IOS: case llvm::Triple::WatchOS: + case llvm::Triple::XROS: case llvm::Triple::TvOS: case llvm::Triple::BridgeOS: break; @@ -329,6 +330,8 @@ void PlatformDarwinKernel::CollectKextAndKernelDirectories() { "/Platforms/AppleTVOS.platform/Developer/SDKs"); AddSDKSubdirsToSearchPaths(developer_dir + "/Platforms/WatchOS.platform/Developer/SDKs"); + AddSDKSubdirsToSearchPaths(developer_dir + + "/Platforms/XROS.platform/Developer/SDKs"); AddSDKSubdirsToSearchPaths(developer_dir + "/Platforms/BridgeOS.platform/Developer/SDKs"); } diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerCommon.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerCommon.cpp index c2fe05cad566e..67ba42f33d1dd 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerCommon.cpp +++ 
b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerCommon.cpp @@ -212,6 +212,8 @@ GDBRemoteCommunicationServerCommon::Handle_qHostInfo( response.PutCString("ostype:tvos;"); #elif defined(TARGET_OS_WATCH) && TARGET_OS_WATCH == 1 response.PutCString("ostype:watchos;"); +#elif defined(TARGET_OS_XR) && TARGET_OS_XR == 1 + response.PutCString("ostype:xros;"); #elif defined(TARGET_OS_BRIDGE) && TARGET_OS_BRIDGE == 1 response.PutCString("ostype:bridgeos;"); #else diff --git a/lldb/tools/debugserver/source/RNBRemote.cpp b/lldb/tools/debugserver/source/RNBRemote.cpp index efa015920c0d5..8a53094429aba 100644 --- a/lldb/tools/debugserver/source/RNBRemote.cpp +++ b/lldb/tools/debugserver/source/RNBRemote.cpp @@ -6369,6 +6369,8 @@ rnb_err_t RNBRemote::HandlePacket_qProcessInfo(const char *p) { rep << "ostype:bridgeos;"; #elif defined(TARGET_OS_OSX) && TARGET_OS_OSX == 1 rep << "ostype:macosx;"; +#elif defined(TARGET_OS_XR) && TARGET_OS_XR == 1 + rep << "ostype:xros;"; #else rep << "ostype:ios;"; #endif @@ -6422,6 +6424,8 @@ rnb_err_t RNBRemote::HandlePacket_qProcessInfo(const char *p) { rep << "ostype:watchos;"; #elif defined(TARGET_OS_BRIDGE) && TARGET_OS_BRIDGE == 1 rep << "ostype:bridgeos;"; +#elif defined(TARGET_OS_XR) && TARGET_OS_XR == 1 + rep << "ostype:xros;"; #else rep << "ostype:ios;"; #endif From f9250401ef120a4605ad67bb43d3b25500900498 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 8 Feb 2025 15:58:12 -0800 Subject: [PATCH 076/293] [TableGen] Move formation of MoveSiblingMatcher earlier in ContractNodes. NFC ContractNodes recursively walks forward through a linked list. During this recursion, Matchers are combined into other Matchers. Previously the formation of MoveSiblingMatcher was after the recursive call so it occurred as we were unwinding. If a MoveSiblingMatcher was formed, we would recursively walk forward to the end of the linked list again which isn't efficient. 
To make this more efficient, move the formation of MoveSiblingMatcher to the forward pass. Add additional rules to unfold MoveSiblingMatcher if it would be more efficient to use CheckChildType, CheckChildInteger, CheckChildSame, etc. As an added benefit, this makes the function tail recursive which the compiler can better optimize. --- llvm/utils/TableGen/DAGISelMatcherOpt.cpp | 182 +++++++++++++++++++--- 1 file changed, 158 insertions(+), 24 deletions(-) diff --git a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp index 400534290a091..b10fe7e0661eb 100644 --- a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp @@ -75,6 +75,164 @@ static void ContractNodes(std::unique_ptr &MatcherPtr, } } + // Turn MoveParent->MoveChild into MoveSibling. + if (auto *MP = dyn_cast(N)) { + if (auto *MC = dyn_cast(MP->getNext())) { + auto *MS = new MoveSiblingMatcher(MC->getChildNo()); + MS->setNext(MC->takeNext()); + MatcherPtr.reset(MS); + return ContractNodes(MatcherPtr, CGP); + } + } + + // Uncontract MoveSibling if it will help form other child operations. + if (auto *MS = dyn_cast(N)) { + if (auto *RM = dyn_cast(MS->getNext())) { + // Turn MoveSibling->Record->MoveParent into MoveParent->RecordChild. + if (auto *MP = dyn_cast(RM->getNext())) { + if (MS->getSiblingNo() < 8) { // Only have RecordChild0...7 + auto *NewMP = new MoveParentMatcher(); + auto *NewRCM = new RecordChildMatcher( + MS->getSiblingNo(), RM->getWhatFor(), RM->getResultNo()); + NewMP->setNext(NewRCM); + NewRCM->setNext(MP->takeNext()); + MatcherPtr.reset(NewMP); + return ContractNodes(MatcherPtr, CGP); + } + } + + // Turn MoveSibling->Record->CheckType->MoveParent into + // MoveParent->RecordChild->CheckChildType. 
+ if (auto *CT = dyn_cast(RM->getNext())) { + if (auto *MP = dyn_cast(CT->getNext())) { + if (MS->getSiblingNo() < 8 && // Only have CheckChildType0...7 + CT->getResNo() == 0) { // CheckChildType checks res #0 + auto *NewMP = new MoveParentMatcher(); + auto *NewRCM = new RecordChildMatcher( + MS->getSiblingNo(), RM->getWhatFor(), RM->getResultNo()); + auto *NewCCT = + new CheckChildTypeMatcher(MS->getSiblingNo(), CT->getType()); + NewMP->setNext(NewRCM); + NewRCM->setNext(NewCCT); + NewCCT->setNext(MP->takeNext()); + MatcherPtr.reset(NewMP); + return ContractNodes(MatcherPtr, CGP); + } + } + } + } + + // Turn MoveSibling->CheckType->MoveParent into MoveParent->CheckChildType. + if (auto *CT = dyn_cast(MS->getNext())) { + if (auto *MP = dyn_cast(CT->getNext())) { + if (MS->getSiblingNo() < 8 && // Only have CheckChildType0...7 + CT->getResNo() == 0) { // CheckChildType checks res #0 + auto *NewMP = new MoveParentMatcher(); + auto *NewCCT = + new CheckChildTypeMatcher(MS->getSiblingNo(), CT->getType()); + NewMP->setNext(NewCCT); + NewCCT->setNext(MP->takeNext()); + MatcherPtr.reset(NewMP); + return ContractNodes(MatcherPtr, CGP); + } + } + } + + // Turn MoveSibling->CheckInteger->MoveParent into + // MoveParent->CheckChildInteger. + if (auto *CI = dyn_cast(MS->getNext())) { + if (auto *MP = dyn_cast(CI->getNext())) { + if (MS->getSiblingNo() < 5) { // Only have CheckChildInteger0...4 + auto *NewMP = new MoveParentMatcher(); + auto *NewCCI = + new CheckChildIntegerMatcher(MS->getSiblingNo(), CI->getValue()); + NewMP->setNext(NewCCI); + NewCCI->setNext(MP->takeNext()); + MatcherPtr.reset(NewMP); + return ContractNodes(MatcherPtr, CGP); + } + } + + // Turn MoveSibling->CheckInteger->CheckType->MoveParent into + // MoveParent->CheckChildInteger->CheckType. 
+ if (auto *CT = dyn_cast(CI->getNext())) { + if (auto *MP = dyn_cast(CT->getNext())) { + if (MS->getSiblingNo() < 5 && // Only have CheckChildInteger0...4 + CT->getResNo() == 0) { // CheckChildType checks res #0 + auto *NewMP = new MoveParentMatcher(); + auto *NewCCI = new CheckChildIntegerMatcher(MS->getSiblingNo(), + CI->getValue()); + auto *NewCCT = + new CheckChildTypeMatcher(MS->getSiblingNo(), CT->getType()); + NewMP->setNext(NewCCI); + NewCCI->setNext(NewCCT); + NewCCT->setNext(MP->takeNext()); + MatcherPtr.reset(NewMP); + return ContractNodes(MatcherPtr, CGP); + } + } + } + } + + // Turn MoveSibling->CheckCondCode->MoveParent into + // MoveParent->CheckChild2CondCode. + if (auto *CCC = dyn_cast(MS->getNext())) { + if (auto *MP = dyn_cast(CCC->getNext())) { + if (MS->getSiblingNo() == 2) { // Only have CheckChild2CondCode + auto *NewMP = new MoveParentMatcher(); + auto *NewCCCC = + new CheckChild2CondCodeMatcher(CCC->getCondCodeName()); + NewMP->setNext(NewCCCC); + NewCCCC->setNext(MP->takeNext()); + MatcherPtr.reset(NewMP); + return ContractNodes(MatcherPtr, CGP); + } + } + } + + // Turn MoveSibling->CheckSame->MoveParent into + // MoveParent->CheckChildSame. + if (auto *CS = dyn_cast(MS->getNext())) { + if (auto *MP = dyn_cast(CS->getNext())) { + if (MS->getSiblingNo() < 4) { // Only have CheckChildSame0...3 + auto *NewMP = new MoveParentMatcher(); + auto *NewCCS = new CheckChildSameMatcher(MS->getSiblingNo(), + CS->getMatchNumber()); + NewMP->setNext(NewCCS); + NewCCS->setNext(MP->takeNext()); + MatcherPtr.reset(NewMP); + return ContractNodes(MatcherPtr, CGP); + } + } + + // Turn MoveSibling->CheckSame->CheckType->MoveParent into + // MoveParent->CheckChildSame->CheckChildType. 
+ if (auto *CT = dyn_cast(CS->getNext())) { + if (auto *MP = dyn_cast(CT->getNext())) { + if (MS->getSiblingNo() < 4 && // Only have CheckChildSame0...3 + CT->getResNo() == 0) { // CheckChildType checks res #0 + auto *NewMP = new MoveParentMatcher(); + auto *NewCCS = new CheckChildSameMatcher(MS->getSiblingNo(), + CS->getMatchNumber()); + auto *NewCCT = + new CheckChildTypeMatcher(MS->getSiblingNo(), CT->getType()); + NewMP->setNext(NewCCS); + NewCCS->setNext(NewCCT); + NewCCT->setNext(MP->takeNext()); + MatcherPtr.reset(NewMP); + return ContractNodes(MatcherPtr, CGP); + } + } + } + } + + // Turn MoveSibling->MoveParent into MoveParent. + if (auto *MP = dyn_cast(MS->getNext())) { + MatcherPtr.reset(MS->takeNext()); + return ContractNodes(MatcherPtr, CGP); + } + } + // Zap movechild -> moveparent. if (MoveChildMatcher *MC = dyn_cast(N)) if (MoveParentMatcher *MP = dyn_cast(MC->getNext())) { @@ -153,30 +311,6 @@ static void ContractNodes(std::unique_ptr &MatcherPtr, } ContractNodes(N->getNextPtr(), CGP); - - // If we have a MoveParent followed by a MoveChild, we convert it to - // MoveSibling. - if (auto *MP = dyn_cast(N)) { - if (auto *MC = dyn_cast(MP->getNext())) { - auto *MS = new MoveSiblingMatcher(MC->getChildNo()); - MS->setNext(MC->takeNext()); - MatcherPtr.reset(MS); - return ContractNodes(MatcherPtr, CGP); - } - if (auto *RC = dyn_cast(MP->getNext())) { - if (auto *MC = dyn_cast(RC->getNext())) { - if (RC->getChildNo() == MC->getChildNo()) { - auto *MS = new MoveSiblingMatcher(MC->getChildNo()); - auto *RM = new RecordMatcher(RC->getWhatFor(), RC->getResultNo()); - // Insert the new node. 
- RM->setNext(MC->takeNext()); - MS->setNext(RM); - MatcherPtr.reset(MS); - return ContractNodes(MatcherPtr, CGP); - } - } - } - } } /// FindNodeWithKind - Scan a series of matchers looking for a matcher with a From 4aa71f0d4cccf1b06949c5a3c5ceb2e19250c7df Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 8 Feb 2025 16:03:44 -0800 Subject: [PATCH 077/293] [TableGen] Fix an unused variable warning. NFC --- llvm/utils/TableGen/DAGISelMatcherOpt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp index b10fe7e0661eb..ed062168dbc6e 100644 --- a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp @@ -227,7 +227,7 @@ static void ContractNodes(std::unique_ptr &MatcherPtr, } // Turn MoveSibling->MoveParent into MoveParent. - if (auto *MP = dyn_cast(MS->getNext())) { + if (isa(MS->getNext())) { MatcherPtr.reset(MS->takeNext()); return ContractNodes(MatcherPtr, CGP); } From c40877d095eaa03d64e614723a69f1d68717f32a Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Sat, 8 Feb 2025 16:25:27 -0800 Subject: [PATCH 078/293] [RISCV] Attach an implicit source operand on vector copies (#126155) Sometimes when we're breaking up a large vector copy into several smaller ones, not every single smaller source register is initialized at the time when the original COPY happens, and the verifier will not be pleased when seeing the smaller copies reading from an undef register. This patch is a workaround for the said issue by attaching an implicit read of the source operand on the newly generated copies. This is tested by llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll which would have crashed the compiler without this fix when LLVM_EXPENSIVE_CHECK is enabled.
Original context: https://github.com/llvm/llvm-project/pull/124825#issuecomment-2639097531 --------- Co-authored-by: Craig Topper --- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 5 ++++ .../test/CodeGen/RISCV/postra-copy-expand.mir | 24 +++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/postra-copy-expand.mir diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 773319ba908c8..1ec299e3c8cc0 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -437,6 +437,11 @@ void RISCVInstrInfo::copyPhysRegVector( MIB.addReg(RISCV::VL, RegState::Implicit); MIB.addReg(RISCV::VTYPE, RegState::Implicit); } + // Add an implicit read of the original source to silence the verifier + // in the cases where some of the smaller VRs we're copying from might be + // undef, caused by the fact that the original, larger source VR might not + // be fully initialized at the time this COPY happens. + MIB.addReg(SrcReg, RegState::Implicit); // If we are copying reversely, we should decrease the encoding. SrcEncoding += (ReversedCopy ? 
-NumCopied : NumCopied); diff --git a/llvm/test/CodeGen/RISCV/postra-copy-expand.mir b/llvm/test/CodeGen/RISCV/postra-copy-expand.mir new file mode 100644 index 0000000000000..e5b85659a0340 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/postra-copy-expand.mir @@ -0,0 +1,24 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=postrapseudos %s -o - | FileCheck %s + +--- +name: copy +isSSA: false +noVRegs: true +liveins: + - { reg: '$v0', virtual-reg: '' } +body: | + bb.0: + liveins: $v0 + + ; CHECK-LABEL: name: copy + ; CHECK: liveins: $v0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $v20m2 = VMV2R_V $v14m2, implicit $vtype, implicit $v14_v15_v16_v17_v18 + ; CHECK-NEXT: $v22m2 = VMV2R_V $v16m2, implicit $vtype, implicit $v14_v15_v16_v17_v18 + ; CHECK-NEXT: $v24 = VMV1R_V $v18, implicit $vtype, implicit $v14_v15_v16_v17_v18, implicit $vtype + ; CHECK-NEXT: PseudoRET implicit $v0 + renamable $v20_v21_v22_v23_v24 = COPY renamable $v14_v15_v16_v17_v18, implicit $vtype + PseudoRET implicit $v0 + +... 
From 10ed0e406589604bf8ea5edd571a6f72dd8a6721 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 8 Feb 2025 16:36:46 -0800 Subject: [PATCH 079/293] [ELF] Reorder target-specific error messages --- lld/ELF/Driver.cpp | 75 +++++++++++++------------- lld/test/ELF/target-specific-options.s | 2 +- 2 files changed, 37 insertions(+), 40 deletions(-) diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 2835b86d05e9c..3d6e022a89e5f 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -371,6 +371,9 @@ static void checkOptions(Ctx &ctx) { if (!ctx.arg.cmseOutputLib.empty()) ErrAlways(ctx) << "--out-implib may not be used without --cmse-implib"; } + if (ctx.arg.fixCortexA8 && !ctx.arg.isLE) + ErrAlways(ctx) + << "--fix-cortex-a8 is not supported on big endian targets"; } else { if (ctx.arg.cmseImplib) ErrAlways(ctx) << "--cmse-implib is only supported on ARM targets"; @@ -378,30 +381,46 @@ static void checkOptions(Ctx &ctx) { ErrAlways(ctx) << "--in-implib is only supported on ARM targets"; if (!ctx.arg.cmseOutputLib.empty()) ErrAlways(ctx) << "--out-implib is only supported on ARM targets"; + if (ctx.arg.fixCortexA8) + ErrAlways(ctx) << "--fix-cortex-a8 is only supported on ARM targets"; + if (ctx.arg.armBe8) + ErrAlways(ctx) << "--be8 is only supported on ARM targets"; } - if (ctx.arg.fixCortexA53Errata843419 && ctx.arg.emachine != EM_AARCH64) - ErrAlways(ctx) - << "--fix-cortex-a53-843419 is only supported on AArch64 targets"; - - if (ctx.arg.fixCortexA8 && ctx.arg.emachine != EM_ARM) - ErrAlways(ctx) << "--fix-cortex-a8 is only supported on ARM targets"; - - if (ctx.arg.armBe8 && ctx.arg.emachine != EM_ARM) - ErrAlways(ctx) << "--be8 is only supported on ARM targets"; - - if (ctx.arg.fixCortexA8 && !ctx.arg.isLE) - ErrAlways(ctx) << "--fix-cortex-a8 is not supported on big endian targets"; - - if (ctx.arg.tocOptimize && ctx.arg.emachine != EM_PPC64) - ErrAlways(ctx) << "--toc-optimize is only supported on PowerPC64 targets"; + if (ctx.arg.emachine !=
EM_AARCH64) { + if (ctx.arg.executeOnly) + ErrAlways(ctx) << "--execute-only is only supported on AArch64 targets"; + if (ctx.arg.fixCortexA53Errata843419) + ErrAlways(ctx) << "--fix-cortex-a53-843419 is only supported on AArch64"; + if (ctx.arg.zPacPlt) + ErrAlways(ctx) << "-z pac-plt only supported on AArch64"; + if (ctx.arg.zForceBti) + ErrAlways(ctx) << "-z force-bti only supported on AArch64"; + if (ctx.arg.zBtiReport != "none") + ErrAlways(ctx) << "-z bti-report only supported on AArch64"; + if (ctx.arg.zPauthReport != "none") + ErrAlways(ctx) << "-z pauth-report only supported on AArch64"; + if (ctx.arg.zGcsReport != "none") + ErrAlways(ctx) << "-z gcs-report only supported on AArch64"; + if (ctx.arg.zGcs != GcsPolicy::Implicit) + ErrAlways(ctx) << "-z gcs only supported on AArch64"; + } - if (ctx.arg.pcRelOptimize && ctx.arg.emachine != EM_PPC64) - ErrAlways(ctx) << "--pcrel-optimize is only supported on PowerPC64 targets"; + if (ctx.arg.emachine != EM_PPC64) { + if (ctx.arg.tocOptimize) + ErrAlways(ctx) << "--toc-optimize is only supported on PowerPC64 targets"; + if (ctx.arg.pcRelOptimize) + ErrAlways(ctx) + << "--pcrel-optimize is only supported on PowerPC64 targets"; + } if (ctx.arg.relaxGP && ctx.arg.emachine != EM_RISCV) ErrAlways(ctx) << "--relax-gp is only supported on RISC-V targets"; + if (ctx.arg.emachine != EM_386 && ctx.arg.emachine != EM_X86_64 && + ctx.arg.zCetReport != "none") + ErrAlways(ctx) << "-z cet-report only supported on X86 and X86_64"; + if (ctx.arg.pie && ctx.arg.shared) ErrAlways(ctx) << "-shared and -pie may not be used together"; @@ -435,9 +454,6 @@ static void checkOptions(Ctx &ctx) { } if (ctx.arg.executeOnly) { - if (ctx.arg.emachine != EM_AARCH64) - ErrAlways(ctx) << "--execute-only is only supported on AArch64 targets"; - if (ctx.arg.singleRoRx && !ctx.script->hasSectionsCommand) ErrAlways(ctx) << "--execute-only and --no-rosegment cannot be used together"; @@ -445,25 +461,6 @@ static void checkOptions(Ctx &ctx) { if 
(ctx.arg.zRetpolineplt && ctx.arg.zForceIbt) ErrAlways(ctx) << "-z force-ibt may not be used with -z retpolineplt"; - - if (ctx.arg.emachine != EM_AARCH64) { - if (ctx.arg.zPacPlt) - ErrAlways(ctx) << "-z pac-plt only supported on AArch64"; - if (ctx.arg.zForceBti) - ErrAlways(ctx) << "-z force-bti only supported on AArch64"; - if (ctx.arg.zBtiReport != "none") - ErrAlways(ctx) << "-z bti-report only supported on AArch64"; - if (ctx.arg.zPauthReport != "none") - ErrAlways(ctx) << "-z pauth-report only supported on AArch64"; - if (ctx.arg.zGcsReport != "none") - ErrAlways(ctx) << "-z gcs-report only supported on AArch64"; - if (ctx.arg.zGcs != GcsPolicy::Implicit) - ErrAlways(ctx) << "-z gcs only supported on AArch64"; - } - - if (ctx.arg.emachine != EM_386 && ctx.arg.emachine != EM_X86_64 && - ctx.arg.zCetReport != "none") - ErrAlways(ctx) << "-z cet-report only supported on X86 and X86_64"; } static const char *getReproduceOption(opt::InputArgList &args) { diff --git a/lld/test/ELF/target-specific-options.s b/lld/test/ELF/target-specific-options.s index 94dccd9e4a0c1..0f126f0186f8b 100644 --- a/lld/test/ELF/target-specific-options.s +++ b/lld/test/ELF/target-specific-options.s @@ -2,7 +2,7 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t # RUN: not ld.lld %t --fix-cortex-a53-843419 -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-843419 -# ERR-843419: error: --fix-cortex-a53-843419 is only supported on AArch64 targets +# ERR-843419: error: --fix-cortex-a53-843419 is only supported on AArch64 # RUN: not ld.lld %t --be8 -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-BE8 # ERR-BE8: error: --be8 is only supported on ARM targets From c89735d289f341985ca2ea74486b96bc611b3c64 Mon Sep 17 00:00:00 2001 From: Michael Kenzel Date: Sun, 9 Feb 2025 03:48:01 +0100 Subject: [PATCH 080/293] Remove dependence on <ciso646> (#73273) C++23 removed `<ciso646>` from the standard library. The header is used in two places: Once in order to pull in standard library macros.
Since this file also includes ``, that use of `<ciso646>` is technically redundant, but should probably be left in in case a future change ever removes the include of ``. A second use of `<ciso646>` appears to have been introduced in da650094b187ee3c8017d74f63c885663faca1d8, but seems unnecessary (the file doesn't seem to use anything from that header, and it seems to build just fine on MSVC here without it). The new `<version>` header should be supported by all supported implementations. This change replaces uses of `<ciso646>` with the `<version>` header, or removes them entirely where unnecessary. --- llvm/include/llvm/Support/Threading.h | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/include/llvm/Support/Threading.h b/llvm/include/llvm/Support/Threading.h index 01e26ad9b858e..9972f4ad31dad 100644 --- a/llvm/include/llvm/Support/Threading.h +++ b/llvm/include/llvm/Support/Threading.h @@ -19,6 +19,7 @@ #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX #include "llvm/Support/Compiler.h" #include +#include #if defined(_MSC_VER) // MSVC's call_once implementation worked since VS 2015, which is the minimum From 7c60725fcf1038f6c84df396496cf52d67ab5b43 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Sat, 8 Feb 2025 20:22:15 -0800 Subject: [PATCH 081/293] Revert "Remove dependence on <ciso646>" (#126399) Reverts llvm/llvm-project#73273 --- llvm/include/llvm/Support/Threading.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/include/llvm/Support/Threading.h b/llvm/include/llvm/Support/Threading.h index 9972f4ad31dad..01e26ad9b858e 100644 --- a/llvm/include/llvm/Support/Threading.h +++ b/llvm/include/llvm/Support/Threading.h @@ -19,7 +19,6 @@ #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX #include "llvm/Support/Compiler.h" #include -#include #if defined(_MSC_VER) // MSVC's call_once implementation worked since VS 2015, which is the minimum From 59cbe2ff591d91e8375cfb4f4ba59dff49a82f4f Mon Sep 17 00:00:00 2001 From: Michael Park Date: Sat, 8 Feb 2025 22:29:23 -0800 Subject: [PATCH
082/293] [C++20][Modules][Serialization] Add an additional test case for #120277. (#126349) https://github.com/llvm/llvm-project/commit/4b35dd57b88a59b169c3471cbc398113d3bf98e8 was shipped to address https://github.com/llvm/llvm-project/issues/120277 . It was thought to be a regression in 19.x according to this comment: https://github.com/llvm/llvm-project/issues/120277#issuecomment-2558991129 This is a test case that fails even in 17.x but nevertheless is also fixed by: https://github.com/llvm/llvm-project/commit/4b35dd57b88a59b169c3471cbc398113d3bf98e8 --- clang/test/Modules/pr120277-2.cpp | 66 +++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 clang/test/Modules/pr120277-2.cpp diff --git a/clang/test/Modules/pr120277-2.cpp b/clang/test/Modules/pr120277-2.cpp new file mode 100644 index 0000000000000..f3a7e47431848 --- /dev/null +++ b/clang/test/Modules/pr120277-2.cpp @@ -0,0 +1,66 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t + +// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-01.h \ +// RUN: -o %t/hu-01.pcm + +// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-02.h \ +// RUN: -Wno-experimental-header-units -fmodule-file=%t/hu-01.pcm -o %t/hu-02.pcm + +// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-03.h \ +// RUN: -Wno-experimental-header-units \ +// RUN: -fmodule-file=%t/hu-01.pcm -o %t/hu-03.pcm + +// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header %t/hu-04.h \ +// RUN: -Wno-experimental-header-units -fmodule-file=%t/hu-02.pcm \ +// RUN: -fmodule-file=%t/hu-03.pcm -o %t/hu-04.pcm + +// RUN: %clang_cc1 -std=c++20 -emit-obj %t/main.cpp \ +// RUN: -Wno-experimental-header-units -fmodule-file=%t/hu-04.pcm +//--- hu-01.h +template +struct A { + ~A() { f(); } + auto f() const { return 0; } +}; + +template +struct B { + int g() const { return a.f(); } + A a; +}; + +//--- hu-02.h +import "hu-01.h"; + +template +struct C { + void 
h() { + B().g(); + } +}; + +template struct A; + +//--- hu-03.h +import "hu-01.h"; + +inline B b() { + return {}; +} + +//--- hu-04.h +import "hu-02.h"; +import "hu-03.h"; + +inline void f4() { + C{}.h(); +} + +//--- main.cpp +import "hu-04.h"; + +int main() { + f4(); +} From 66c31f5d024f3ec9f9afa74c340ba0a4e0776823 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 8 Feb 2025 23:17:06 -0800 Subject: [PATCH 083/293] [AMDGPU] Avoid repeated hash lookups (NFC) (#126401) This patch just cleans up the "if" condition. Further cleanups are left to subsequent patches. --- llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index f4e651ec477d3..b8109db821bcc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -367,11 +367,11 @@ bool LiveRegOptimizer::optimizeLiveType( for (Instruction *U : Uses) { // Replace all converted operands for a use. for (auto [OpIdx, Op] : enumerate(U->operands())) { - if (ValMap.contains(Op) && ValMap[Op]) { + if (Value *Val = ValMap.lookup(Op)) { Value *NewVal = nullptr; if (BBUseValMap.contains(U->getParent()) && - BBUseValMap[U->getParent()].contains(ValMap[Op])) - NewVal = BBUseValMap[U->getParent()][ValMap[Op]]; + BBUseValMap[U->getParent()].contains(Val)) + NewVal = BBUseValMap[U->getParent()][Val]; else { BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt(); // We may pick up ops that were previously converted for users in From 8d373ceaec1f1b27c9e682cfaf71aae19ea48d98 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sat, 8 Feb 2025 23:22:33 -0800 Subject: [PATCH 084/293] [clang-format] Handle C-style cast of member function pointer type (#126340) Fixes #125012. 
--- clang/lib/Format/TokenAnnotator.cpp | 7 +++++-- clang/unittests/Format/TokenAnnotatorTest.cpp | 6 ++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 94fd7ba9c0e79..b3540f39e6f69 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -477,8 +477,9 @@ class AnnotatingParser { FormatToken *PossibleObjCForInToken = nullptr; while (CurrentToken) { const auto &Prev = *CurrentToken->Previous; + const auto *PrevPrev = Prev.Previous; if (Prev.is(TT_PointerOrReference) && - Prev.Previous->isOneOf(tok::l_paren, tok::coloncolon)) { + PrevPrev->isOneOf(tok::l_paren, tok::coloncolon)) { ProbablyFunctionType = true; } if (CurrentToken->is(tok::comma)) @@ -486,8 +487,10 @@ class AnnotatingParser { if (Prev.is(TT_BinaryOperator)) Contexts.back().IsExpression = true; if (CurrentToken->is(tok::r_paren)) { - if (Prev.is(TT_PointerOrReference) && Prev.Previous == &OpeningParen) + if (Prev.is(TT_PointerOrReference) && + (PrevPrev == &OpeningParen || PrevPrev->is(tok::coloncolon))) { MightBeFunctionType = true; + } if (OpeningParen.isNot(TT_CppCastLParen) && MightBeFunctionType && ProbablyFunctionType && CurrentToken->Next && (CurrentToken->Next->is(tok::l_paren) || diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 1b09c45703456..54d8ff0571ca6 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -874,6 +874,12 @@ TEST_F(TokenAnnotatorTest, UnderstandsCasts) { EXPECT_TOKEN(Tokens[14], tok::r_paren, TT_CastRParen); EXPECT_TOKEN(Tokens[15], tok::amp, TT_UnaryOperator); + Tokens = annotate("return (Foo (Bar::*)())&Bar::foo;"); + ASSERT_EQ(Tokens.size(), 17u) << Tokens; + EXPECT_TOKEN(Tokens[3], tok::l_paren, TT_FunctionTypeLParen); + EXPECT_TOKEN(Tokens[10], tok::r_paren, TT_CastRParen); + EXPECT_TOKEN(Tokens[11], tok::amp, 
TT_UnaryOperator); + auto Style = getLLVMStyle(); Style.TypeNames.push_back("Foo"); Tokens = annotate("#define FOO(bar) foo((Foo)&bar)", Style); From 7b348f9bfdb319fe9497c881311eaa0aa40fed88 Mon Sep 17 00:00:00 2001 From: Abhishek Kaushik Date: Sun, 9 Feb 2025 00:21:34 -0800 Subject: [PATCH 085/293] [MIR][NFC] Use `std::move` to avoid copying (#125930) --- llvm/lib/CodeGen/MIRPrinter.cpp | 18 +++++++++--------- llvm/lib/CodeGen/MachineFunction.cpp | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index 0b41c90442a5d..e936b16531373 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -452,7 +452,7 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &YMF, YamlObject.IsAliased = MFI.isAliasedObjectIndex(I); // Save the ID' position in FixedStackObjects storage vector. FixedStackObjectsIdx[ID] = YMF.FixedStackObjects.size(); - YMF.FixedStackObjects.push_back(YamlObject); + YMF.FixedStackObjects.push_back(std::move(YamlObject)); StackObjectOperandMapping.insert( std::make_pair(I, FrameIndexOperand::createFixed(ID))); } @@ -506,11 +506,11 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &YMF, auto &Object = YMF.FixedStackObjects [FixedStackObjectsIdx[FrameIdx + MFI.getNumFixedObjects()]]; - Object.CalleeSavedRegister = Reg; + Object.CalleeSavedRegister = std::move(Reg); Object.CalleeSavedRestored = CSInfo.isRestored(); } else { auto &Object = YMF.StackObjects[StackObjectsIdx[FrameIdx]]; - Object.CalleeSavedRegister = Reg; + Object.CalleeSavedRegister = std::move(Reg); Object.CalleeSavedRestored = CSInfo.isRestored(); } } @@ -576,7 +576,7 @@ void MIRPrinter::convertCallSiteObjects(yaml::MachineFunction &YMF, printRegMIR(ArgReg.Reg, YmlArgReg.Reg, TRI); YmlCS.ArgForwardingRegs.emplace_back(YmlArgReg); } - YMF.CallSitesInfo.push_back(YmlCS); + YMF.CallSitesInfo.push_back(std::move(YmlCS)); } // Sort call info by position of 
call instructions. @@ -597,7 +597,7 @@ void MIRPrinter::convertMachineMetadataNodes(yaml::MachineFunction &YMF, std::string NS; raw_string_ostream StrOS(NS); MD.second->print(StrOS, MST, MF.getFunction().getParent()); - YMF.MachineMetadataNodes.push_back(NS); + YMF.MachineMetadataNodes.push_back(std::move(NS)); } } @@ -612,7 +612,7 @@ void MIRPrinter::convertCalledGlobals(yaml::MachineFunction &YMF, yaml::CalledGlobal YamlCG{CallSite, CG.Callee->getName().str(), CG.TargetFlags}; - YMF.CalledGlobals.push_back(YamlCG); + YMF.CalledGlobals.push_back(std::move(YamlCG)); } // Sort by position of call instructions. @@ -638,11 +638,11 @@ void MIRPrinter::convert(yaml::MachineFunction &MF, yaml::MachineConstantPoolValue YamlConstant; YamlConstant.ID = ID++; - YamlConstant.Value = Str; + YamlConstant.Value = std::move(Str); YamlConstant.Alignment = Constant.getAlign(); YamlConstant.IsTargetSpecific = Constant.isMachineConstantPoolEntry(); - MF.Constants.push_back(YamlConstant); + MF.Constants.push_back(std::move(YamlConstant)); } } @@ -661,7 +661,7 @@ void MIRPrinter::convert(ModuleSlotTracker &MST, Entry.Blocks.push_back(Str); Str.clear(); } - YamlJTI.Entries.push_back(Entry); + YamlJTI.Entries.push_back(std::move(Entry)); } } diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index 7d504ef5a0482..6e0342a763d15 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -967,13 +967,13 @@ void MachineFunction::copyAdditionalCallInfo(const MachineInstr *Old, CallSiteInfoMap::iterator CSIt = getCallSiteInfo(OldCallMI); if (CSIt != CallSitesInfo.end()) { CallSiteInfo CSInfo = CSIt->second; - CallSitesInfo[New] = CSInfo; + CallSitesInfo[New] = std::move(CSInfo); } CalledGlobalsMap::iterator CGIt = CalledGlobalsInfo.find(OldCallMI); if (CGIt != CalledGlobalsInfo.end()) { CalledGlobalInfo CGInfo = CGIt->second; - CalledGlobalsInfo[New] = CGInfo; + CalledGlobalsInfo[New] = std::move(CGInfo); } } @@ -991,14 
+991,14 @@ void MachineFunction::moveAdditionalCallInfo(const MachineInstr *Old, if (CSIt != CallSitesInfo.end()) { CallSiteInfo CSInfo = std::move(CSIt->second); CallSitesInfo.erase(CSIt); - CallSitesInfo[New] = CSInfo; + CallSitesInfo[New] = std::move(CSInfo); } CalledGlobalsMap::iterator CGIt = CalledGlobalsInfo.find(OldCallMI); if (CGIt != CalledGlobalsInfo.end()) { CalledGlobalInfo CGInfo = std::move(CGIt->second); CalledGlobalsInfo.erase(CGIt); - CalledGlobalsInfo[New] = CGInfo; + CalledGlobalsInfo[New] = std::move(CGInfo); } } From 5ecc86bbcaebea5e7e480a3b2a5c4327f204bf3b Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Sun, 9 Feb 2025 10:35:14 +0100 Subject: [PATCH 086/293] [ValueTracking] test trunc to i1 as condition in dominating condition. (NFC) --- .../test/Transforms/InstCombine/known-bits.ll | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll index cbd9737415f1f..a3872fefecf3b 100644 --- a/llvm/test/Transforms/InstCombine/known-bits.ll +++ b/llvm/test/Transforms/InstCombine/known-bits.ll @@ -2161,6 +2161,158 @@ define i1 @mul_nuw_nsw_nonneg_cant_be_one_commuted(i8 %x, i8 %y) { ret i1 %cmp } +define i8 @test_trunc_and_1(i8 %a) { +; CHECK-LABEL: @test_trunc_and_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAST:%.*]] = trunc i8 [[A:%.*]] to i1 +; CHECK-NEXT: br i1 [[CAST]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[B:%.*]] = and i8 [[A]], 1 +; CHECK-NEXT: ret i8 [[B]] +; CHECK: if.else: +; CHECK-NEXT: [[C:%.*]] = and i8 [[A]], 1 +; CHECK-NEXT: ret i8 [[C]] +; +entry: + %cast = trunc i8 %a to i1 + br i1 %cast, label %if.then, label %if.else + +if.then: + %b = and i8 %a, 1 + ret i8 %b + +if.else: + %c = and i8 %a, 1 + ret i8 %c +} + +define i8 @test_not_trunc_and_1(i8 %a) { +; CHECK-LABEL: @test_not_trunc_and_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAST:%.*]] = trunc i8 [[A:%.*]] to i1 +; CHECK-NEXT: 
br i1 [[CAST]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[B:%.*]] = and i8 [[A]], 1 +; CHECK-NEXT: ret i8 [[B]] +; CHECK: if.else: +; CHECK-NEXT: [[C:%.*]] = and i8 [[A]], 1 +; CHECK-NEXT: ret i8 [[C]] +; +entry: + %cast = trunc i8 %a to i1 + %not = xor i1 %cast, true + br i1 %not, label %if.then, label %if.else + +if.then: + %b = and i8 %a, 1 + ret i8 %b + +if.else: + %c = and i8 %a, 1 + ret i8 %c +} + +define i8 @neg_test_trunc_or_2(i8 %a) { +; CHECK-LABEL: @neg_test_trunc_or_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAST:%.*]] = trunc i8 [[A:%.*]] to i1 +; CHECK-NEXT: br i1 [[CAST]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[B:%.*]] = or i8 [[A]], 2 +; CHECK-NEXT: ret i8 [[B]] +; CHECK: if.else: +; CHECK-NEXT: [[C:%.*]] = or i8 [[A]], 2 +; CHECK-NEXT: ret i8 [[C]] +; +entry: + %cast = trunc i8 %a to i1 + br i1 %cast, label %if.then, label %if.else + +if.then: + %b = or i8 %a, 2 + ret i8 %b + +if.else: + %c = or i8 %a, 2 + ret i8 %c +} + +define i8 @test_trunc_nuw_and_1(i8 %a) { +; CHECK-LABEL: @test_trunc_nuw_and_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAST:%.*]] = trunc nuw i8 [[A:%.*]] to i1 +; CHECK-NEXT: br i1 [[CAST]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[B:%.*]] = and i8 [[A]], 1 +; CHECK-NEXT: ret i8 [[B]] +; CHECK: if.else: +; CHECK-NEXT: [[C:%.*]] = and i8 [[A]], 1 +; CHECK-NEXT: ret i8 [[C]] +; +entry: + %cast = trunc nuw i8 %a to i1 + br i1 %cast, label %if.else, label %if.then + +if.then: + %b = and i8 %a, 1 + ret i8 %b + +if.else: + %c = and i8 %a, 1 + ret i8 %c +} + +define i8 @test_trunc_nuw_or_2(i8 %a) { +; CHECK-LABEL: @test_trunc_nuw_or_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAST:%.*]] = trunc nuw i8 [[A:%.*]] to i1 +; CHECK-NEXT: br i1 [[CAST]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[B:%.*]] = or i8 [[A]], 2 +; CHECK-NEXT: ret i8 [[B]] +; CHECK: if.else: +; CHECK-NEXT: 
[[C:%.*]] = or i8 [[A]], 2 +; CHECK-NEXT: ret i8 [[C]] +; +entry: + %cast = trunc nuw i8 %a to i1 + br i1 %cast, label %if.else, label %if.then + +if.then: + %b = or i8 %a, 2 + ret i8 %b + +if.else: + %c = or i8 %a, 2 + ret i8 %c +} + +define i8 @test_not_trunc_nuw_and_1(i8 %a) { +; CHECK-LABEL: @test_not_trunc_nuw_and_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAST:%.*]] = trunc nuw i8 [[A:%.*]] to i1 +; CHECK-NEXT: br i1 [[CAST]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[B:%.*]] = and i8 [[A]], 1 +; CHECK-NEXT: ret i8 [[B]] +; CHECK: if.else: +; CHECK-NEXT: [[C:%.*]] = and i8 [[A]], 1 +; CHECK-NEXT: ret i8 [[C]] +; +entry: + %cast = trunc nuw i8 %a to i1 + %not = xor i1 %cast, true + br i1 %not, label %if.then, label %if.else + +if.then: + %b = and i8 %a, 1 + ret i8 %b + +if.else: + %c = and i8 %a, 1 + ret i8 %c +} + declare void @dummy() declare void @use(i1) declare void @sink(i8) From 32c4493d5f8164ebe9d3d3e01ca744e6c3afcf17 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 9 Feb 2025 11:20:19 +0000 Subject: [PATCH 087/293] [VPlan] Add incoming values for all predecessor to ResumePHI (NFCI). Follow-up as discussed when using VPInstruction::ResumePhi for all resume values (#112147). This patch explicitly adds incoming values for each predecessor in VPlan. This simplifies codegen and allows transformations adjusting the predecessors of blocks with NFC modulo incoming block order in phis. 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 23 +++- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 17 +-- .../AArch64/conditional-branches-cost.ll | 10 +- .../AArch64/deterministic-type-shrinkage.ll | 2 +- .../AArch64/divs-with-scalable-vfs.ll | 6 +- .../AArch64/epilog-vectorization-factors.ll | 6 +- .../epilog-vectorization-widen-inductions.ll | 16 +-- .../AArch64/force-target-instruction-cost.ll | 6 +- .../AArch64/induction-costs-sve.ll | 32 ++--- .../LoopVectorize/AArch64/induction-costs.ll | 6 +- ...interleave-allocsize-not-equal-typesize.ll | 2 +- .../AArch64/interleaving-load-store.ll | 4 +- .../AArch64/interleaving-reduction.ll | 4 +- .../LoopVectorize/AArch64/intrinsiccost.ll | 12 +- .../AArch64/loop-vectorization-factors.ll | 10 +- .../AArch64/low_trip_count_predicates.ll | 6 +- .../partial-reduce-dot-product-epilogue.ll | 10 +- .../AArch64/pr60831-sve-inv-store-crash.ll | 2 +- .../AArch64/simple_early_exit.ll | 4 +- .../LoopVectorize/AArch64/store-costs-sve.ll | 6 +- .../sve-epilog-vect-inloop-reductions.ll | 4 +- .../AArch64/sve-epilog-vect-reductions.ll | 4 +- .../sve-epilog-vect-strict-reductions.ll | 4 +- .../LoopVectorize/AArch64/sve-epilog-vect.ll | 16 +-- .../LoopVectorize/AArch64/sve-fneg.ll | 2 +- .../AArch64/sve-interleaved-accesses.ll | 6 +- .../LoopVectorize/AArch64/sve-multiexit.ll | 4 +- .../sve-runtime-check-size-based-threshold.ll | 2 +- .../AArch64/sve-vector-reverse.ll | 2 +- .../AArch64/sve2-histcnt-epilogue.ll | 2 +- .../AArch64/sve2-histcnt-too-many-deps.ll | 2 +- .../LoopVectorize/AArch64/sve2-histcnt.ll | 2 +- .../ARM/mve-gather-scatter-tailpred.ll | 2 +- .../ARM/mve-hoist-runtime-checks.ll | 2 +- .../LoopVectorize/ARM/mve-multiexit.ll | 4 +- .../LoopVectorize/PowerPC/exit-branch-cost.ll | 6 +- .../PowerPC/optimal-epilog-vectorization.ll | 12 +- .../RISCV/blocks-with-dead-instructions.ll | 2 +- .../LoopVectorize/RISCV/dead-ops-cost.ll | 4 +- .../LoopVectorize/RISCV/induction-costs.ll | 4 +- .../RISCV/masked_gather_scatter.ll | 
4 +- .../LoopVectorize/RISCV/strided-accesses.ll | 20 +-- .../RISCV/type-info-cache-evl-crash.ll | 2 +- ...-force-tail-with-evl-bin-unary-ops-args.ll | 36 +++--- ...ize-force-tail-with-evl-call-intrinsics.ll | 18 +-- ...ize-force-tail-with-evl-cast-intrinsics.ll | 20 +-- ...-force-tail-with-evl-intermediate-store.ll | 16 +-- .../vf-will-not-generate-any-vector-insts.ll | 2 +- .../LoopVectorize/X86/conversion-cost.ll | 2 +- .../LoopVectorize/X86/cost-model.ll | 10 +- .../X86/divs-with-tail-folding.ll | 4 +- .../X86/epilog-vectorization-inductions.ll | 6 +- .../LoopVectorize/X86/float-induction-x86.ll | 8 +- .../LoopVectorize/X86/gather_scatter.ll | 8 +- .../illegal-parallel-loop-uniform-write.ll | 2 +- .../LoopVectorize/X86/induction-costs.ll | 16 +-- .../LoopVectorize/X86/interleave-cost.ll | 4 +- .../LoopVectorize/X86/intrinsiccost.ll | 12 +- .../X86/invariant-load-gather.ll | 2 +- .../X86/invariant-store-vectorization.ll | 8 +- .../X86/limit-vf-by-tripcount.ll | 6 +- .../LoopVectorize/X86/masked-store-cost.ll | 2 +- .../LoopVectorize/X86/masked_load_store.ll | 42 +++--- .../LoopVectorize/X86/multi-exit-cost.ll | 4 +- .../Transforms/LoopVectorize/X86/pr23997.ll | 2 +- .../Transforms/LoopVectorize/X86/pr35432.ll | 4 +- .../Transforms/LoopVectorize/X86/pr36524.ll | 4 +- .../Transforms/LoopVectorize/X86/pr47437.ll | 2 +- .../Transforms/LoopVectorize/X86/pr54634.ll | 2 +- ...ond-optimization-epilogue-vectorization.ll | 2 +- .../Transforms/LoopVectorize/X86/pr72969.ll | 6 +- .../LoopVectorize/X86/scatter_crash.ll | 8 +- .../LoopVectorize/X86/strided_load_cost.ll | 8 +- .../LoopVectorize/X86/uniform_mem_op.ll | 2 +- .../X86/vect.omp.force.small-tc.ll | 2 +- .../X86/vectorize-force-tail-with-evl.ll | 2 +- .../LoopVectorize/dead_instructions.ll | 2 +- .../LoopVectorize/epilog-iv-select-cmp.ll | 8 +- .../epilog-vectorization-any-of-reductions.ll | 18 +-- .../epilog-vectorization-reductions.ll | 22 ++-- ...log-vectorization-trunc-induction-steps.ll | 2 +- 
.../first-order-recurrence-complex.ll | 8 +- llvm/test/Transforms/LoopVectorize/fpsat.ll | 4 +- .../LoopVectorize/if-conversion-nest.ll | 4 +- .../LoopVectorize/if-pred-non-void.ll | 6 +- .../Transforms/LoopVectorize/induction.ll | 122 +++++++++--------- .../LoopVectorize/interleaved-accesses-3.ll | 4 +- .../LoopVectorize/interleaved-accesses.ll | 6 +- .../invariant-store-vectorization-2.ll | 8 +- .../invariant-store-vectorization.ll | 12 +- .../LoopVectorize/load-deref-pred-align.ll | 4 +- .../multiple-strides-vectorization.ll | 4 +- ...o-fold-tail-by-masking-iv-external-uses.ll | 4 +- .../LoopVectorize/no_outside_user.ll | 2 +- .../Transforms/LoopVectorize/opaque-ptr.ll | 8 +- .../optimal-epilog-vectorization-liveout.ll | 2 +- .../optimal-epilog-vectorization.ll | 26 ++-- llvm/test/Transforms/LoopVectorize/optsize.ll | 8 +- .../pointer-select-runtime-checks.ll | 10 +- .../pr30654-phiscev-sext-trunc.ll | 12 +- llvm/test/Transforms/LoopVectorize/pr37248.ll | 4 +- llvm/test/Transforms/LoopVectorize/pr45259.ll | 2 +- ...pr47343-expander-lcssa-after-cfg-update.ll | 2 +- llvm/test/Transforms/LoopVectorize/pr50686.ll | 2 +- .../pr59319-loop-access-info-invalidation.ll | 4 +- .../LoopVectorize/reduction-align.ll | 4 +- .../LoopVectorize/reverse_induction.ll | 6 +- .../runtime-check-needed-but-empty.ll | 2 +- .../runtime-check-small-clamped-bounds.ll | 8 +- .../Transforms/LoopVectorize/runtime-check.ll | 4 +- ...ntime-checks-difference-simplifications.ll | 4 +- .../LoopVectorize/runtime-checks-hoist.ll | 24 ++-- .../scev-exit-phi-invalidation.ll | 24 ++-- .../LoopVectorize/scev-predicate-reasoning.ll | 6 +- .../LoopVectorize/select-cmp-multiuse.ll | 18 +-- .../LoopVectorize/single_early_exit.ll | 4 +- .../LoopVectorize/skeleton-lcssa-crash.ll | 2 +- .../version-stride-with-integer-casts.ll | 22 ++-- .../AArch64/indvars-vectorization.ll | 2 +- .../AArch64/matrix-extract-insert.ll | 8 +- .../X86/pr48844-br-to-switch-vectorization.ll | 2 +- 
.../test/Transforms/PhaseOrdering/X86/vdiv.ll | 2 +- 122 files changed, 520 insertions(+), 510 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index dacee6445072a..4c0bed6fedf5b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2507,6 +2507,15 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { } VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH); PreVectorPH->swapSuccessors(); + + // We just connected a new block to the scalar preheader. Update all + // ResumePhis by adding an incoming value for it. + for (VPRecipeBase &R : *cast(ScalarPH)) { + auto *ResumePhi = dyn_cast(&R); + if (!ResumePhi || ResumePhi->getOpcode() != VPInstruction::ResumePhi) + continue; + ResumePhi->addOperand(ResumePhi->getOperand(1)); + } } void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { @@ -7642,8 +7651,8 @@ static void fixReductionScalarResumeWhenVectorizingEpilog( // over the incoming values correctly. 
using namespace VPlanPatternMatch; auto IsResumePhi = [](VPUser *U) { - return match( - U, m_VPInstruction(m_VPValue(), m_VPValue())); + auto *VPI = dyn_cast(U); + return VPI && VPI->getOpcode() == VPInstruction::ResumePhi; }; assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 && "ResumePhi must have a single user"); @@ -7747,12 +7756,20 @@ DenseMap LoopVectorizationPlanner::executePlan( if (VectorizingEpilogue) { assert(!ILV.Legal->hasUncountableEarlyExit() && "Epilogue vectorisation not yet supported with early exits"); + BasicBlock *PH = OrigLoop->getLoopPreheader(); BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock(); + for (auto *Pred : predecessors(PH)) { + for (PHINode &Phi : PH->phis()) { + if (Phi.getBasicBlockIndex(Pred) != -1) + continue; + Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred); + } + } + for (VPRecipeBase &R : *MiddleVPBB) { fixReductionScalarResumeWhenVectorizingEpilog( &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock); } - BasicBlock *PH = OrigLoop->getLoopPreheader(); for (const auto &[IVPhi, _] : Legal->getInductionVars()) { auto *Inc = cast(IVPhi->getIncomingValueForBlock(PH)); Value *V = ILV.getInductionAdditionalBypassValue(IVPhi); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index bc80c5ea0b1b2..08d7338da87bc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -685,20 +685,13 @@ Value *VPInstruction::generate(VPTransformState &State) { return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags()); } case VPInstruction::ResumePhi: { - Value *IncomingFromVPlanPred = - State.get(getOperand(0), /* IsScalar */ true); - Value *IncomingFromOtherPreds = - State.get(getOperand(1), /* IsScalar */ true); auto *NewPhi = Builder.CreatePHI(State.TypeAnalysis.inferScalarType(this), 2, Name); - BasicBlock *VPlanPred = - State.CFG - .VPBB2IRBB[cast(getParent()->getPredecessors()[0])]; - 
NewPhi->addIncoming(IncomingFromVPlanPred, VPlanPred); - for (auto *OtherPred : predecessors(Builder.GetInsertBlock())) { - if (OtherPred == VPlanPred) - continue; - NewPhi->addIncoming(IncomingFromOtherPreds, OtherPred); + for (const auto &[IncVPV, PredVPBB] : + zip(operands(), getParent()->getPredecessors())) { + Value *IncV = State.get(IncVPV, /* IsScalar */ true); + BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(cast(PredVPBB)); + NewPhi->addIncoming(IncV, PredBB); } return NewPhi; } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index caa98d766a8c3..754b86ab2fb87 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -369,7 +369,7 @@ define void @latch_branch_cost(ptr %dst) { ; DEFAULT: vec.epilog.middle.block: ; DEFAULT-NEXT: br i1 true, label [[FOR_END]], label [[SCALAR_PH]] ; DEFAULT: vec.epilog.scalar.ph: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; DEFAULT-NEXT: br label [[FOR_BODY:%.*]] ; DEFAULT: loop: ; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -588,7 +588,7 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; DEFAULT: scalar.ph: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], 
[ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; DEFAULT-NEXT: br label [[LOOP_HEADER:%.*]] ; DEFAULT: loop.header: ; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] @@ -715,7 +715,7 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt ; PRED: middle.block: ; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; PRED-NEXT: br label [[LOOP_HEADER:%.*]] ; PRED: loop.header: ; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] @@ -1301,7 +1301,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; DEFAULT: scalar.ph: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; DEFAULT-NEXT: br label [[LOOP_HEADER:%.*]] ; DEFAULT: loop.header: ; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] @@ -1523,7 +1523,7 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias ; PRED: middle.block: ; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 
0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; PRED-NEXT: br label [[LOOP_HEADER:%.*]] ; PRED: loop.header: ; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll index 84e4e29fd63c6..c2502aac5b61d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll @@ -83,7 +83,7 @@ define void @test_pr25490(i32 %n, ptr noalias nocapture %a, ptr noalias nocaptur ; CHECK-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N_VEC5]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N11]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll index 2c37593be7861..ecf49a31ba79f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll @@ -52,7 +52,7 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 
0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -162,7 +162,7 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] @@ -289,7 +289,7 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll index 898e515de0fe3..4a2de58938043 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll @@ -85,7 +85,7 @@ define void 
@add_i8(ptr noalias nocapture noundef writeonly %A, ptr nocapture no ; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC10]] ; CHECK-NEXT: br i1 [[CMP_N15]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -203,7 +203,7 @@ define void @add_i16(ptr noalias nocapture noundef writeonly %A, ptr nocapture n ; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC10]] ; CHECK-NEXT: br i1 [[CMP_N15]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -321,7 +321,7 @@ define void @add_i32(ptr noalias nocapture noundef writeonly %A, ptr nocapture n ; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC10]] ; CHECK-NEXT: br i1 [[CMP_N15]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll index 88b14b18c1588..6e26cfa17a4da 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll @@ -73,8 +73,8 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) { ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 10000, [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PTR_START_1]], [[ITER_CHECK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PTR_START_1]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -165,7 +165,7 @@ define void @test_widen_induction(ptr %A, i64 %N) { ; CHECK-NEXT: [[CMP_N9:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N9]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: 
vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] @@ -258,7 +258,7 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) { ; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N15]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], [[ITER_CHECK:%.*]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] @@ -344,8 +344,8 @@ define void @test_widen_induction_step_2(ptr %A, i64 %N, i32 %step) { ; CHECK-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[IND_END]] ; CHECK-NEXT: br i1 [[CMP_N11]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 
[[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL5]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] @@ -431,7 +431,7 @@ define void @test_widen_extended_induction(ptr %dst) { ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i8 [ 16, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i8 [ 16, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL1]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -509,7 +509,7 @@ define void @test_widen_truncated_induction(ptr %A) { ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 10000, [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll index 
8c5d84e6981bc..987d18c427aef 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll @@ -116,8 +116,8 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) { ; CHECK-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[START]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N11]], label %[[EXIT_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END1]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], %[[ITER_CHECK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi ptr [ [[IND_END5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PTR_START]], %[[ITER_CHECK]] ], [ [[IND_END2]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END1]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi ptr [ [[IND_END5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END2]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[PTR_START]], %[[ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ] @@ -284,7 +284,7 @@ define void @test_exit_branch_cost(ptr %dst, ptr noalias %x.ptr, ptr noalias %y. 
; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll index 56a468ed1310b..62186d5a20989 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll @@ -70,7 +70,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 { ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] ; DEFAULT: scalar.ph: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; DEFAULT-NEXT: br label [[LOOP:%.*]] ; DEFAULT: loop: ; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -146,7 +146,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 { ; PRED: middle.block: ; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, 
[[VECTOR_MEMCHECK]] ] ; PRED-NEXT: br label [[LOOP:%.*]] ; PRED: loop: ; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -238,7 +238,7 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 { ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; DEFAULT: scalar.ph: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; DEFAULT-NEXT: br label [[FOR_BODY:%.*]] ; DEFAULT: for.body: ; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -318,7 +318,7 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 { ; PRED: middle.block: ; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; PRED-NEXT: br label [[FOR_BODY:%.*]] ; PRED: for.body: ; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -407,8 +407,8 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; DEFAULT: scalar.ph: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; DEFAULT-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 
[[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; DEFAULT-NEXT: br label [[LOOP:%.*]] ; DEFAULT: loop: ; DEFAULT-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] @@ -513,8 +513,8 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { ; PRED: middle.block: ; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; PRED-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; PRED-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; PRED-NEXT: br label [[LOOP:%.*]] ; PRED: loop: ; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] @@ -605,8 +605,8 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; DEFAULT: scalar.ph: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] 
] ; DEFAULT-NEXT: br label [[LOOP:%.*]] ; DEFAULT: loop: ; DEFAULT-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] @@ -710,8 +710,8 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; PRED: middle.block: ; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; PRED-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; PRED-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; PRED-NEXT: br label [[LOOP:%.*]] ; PRED: loop: ; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] @@ -786,8 +786,8 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; DEFAULT: scalar.ph: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; DEFAULT-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; DEFAULT-NEXT: br label [[LOOP:%.*]] ; DEFAULT: loop: ; DEFAULT-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] @@ -852,8 +852,8 @@ define void @exit_cond_zext_iv(ptr 
%dst, i64 %N) { ; PRED: middle.block: ; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; PRED-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; PRED-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; PRED-NEXT: br label [[LOOP:%.*]] ; PRED: loop: ; PRED-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll index f9cc195e36702..5084f4c48bd0e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll @@ -263,7 +263,7 @@ define void @wide_truncated_iv(ptr %dst) { ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT]], label [[SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 200, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 192, [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 200, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 192, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP1]] ] @@ -416,8 +416,8 @@ define void @zext_iv_increment(ptr %dst, i64 %N) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll index bd77f9779b680..79d7ab84b3a0f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll @@ -54,7 +54,7 @@ define void @pr58722_load_interleave_group(ptr %src, ptr %dst) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll index 24ff9c67f80e3..8320608d67588 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll @@ -103,7 
+103,7 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8 ; INTERLEAVE-4-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[N_VEC10]] ; INTERLEAVE-4-NEXT: br i1 [[CMP_N11]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; INTERLEAVE-4: vec.epilog.scalar.ph: -; INTERLEAVE-4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; INTERLEAVE-4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; INTERLEAVE-4-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE-4: loop: ; INTERLEAVE-4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -198,7 +198,7 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8 ; INTERLEAVE-2-NEXT: [[CMP_N9:%.*]] = icmp eq i64 [[N]], [[N_VEC8]] ; INTERLEAVE-2-NEXT: br i1 [[CMP_N9]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; INTERLEAVE-2: vec.epilog.scalar.ph: -; INTERLEAVE-2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; INTERLEAVE-2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; INTERLEAVE-2-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE-2: loop: ; INTERLEAVE-2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll index edb58e2d8f12f..07aa57d329c7d 100644 --- 
a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll @@ -79,8 +79,8 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) { ; INTERLEAVE-4-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[N]], [[N_VEC11]] ; INTERLEAVE-4-NEXT: br i1 [[CMP_N16]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; INTERLEAVE-4: vec.epilog.scalar.ph: -; INTERLEAVE-4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] -; INTERLEAVE-4-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i32 [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ] +; INTERLEAVE-4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; INTERLEAVE-4-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i32 [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; INTERLEAVE-4-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE-4: loop: ; INTERLEAVE-4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll index 9dceb0167a4ac..1921e5f193aa3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll @@ -87,9 +87,9 @@ define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca ; CHECK-NEXT: [[CMP_N24:%.*]] = icmp eq i64 [[N_VEC6]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N24]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ], [ 
[[IND_END8]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL11:%.*]] = phi ptr [ [[IND_END9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PSRC]], [[ITER_CHECK]] ], [ [[IND_END10]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL14:%.*]] = phi ptr [ [[IND_END12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PDST]], [[ITER_CHECK]] ], [ [[IND_END13]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END8]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL11:%.*]] = phi ptr [ [[IND_END9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END10]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL14:%.*]] = phi ptr [ [[IND_END12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END13]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] @@ -202,9 +202,9 @@ define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur ; CHECK-NEXT: [[CMP_N21:%.*]] = icmp eq i64 [[N_VEC5]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N21]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ], [ [[IND_END7]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi ptr [ [[IND_END8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PSRC]], [[ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi ptr [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PDST]], [[ITER_CHECK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END7]], 
[[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi ptr [ [[IND_END8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi ptr [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll index 65de254c03ca2..4c54796d9d16d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -63,7 +63,7 @@ define void @add_a(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] @@ -166,7 +166,7 @@ define void @add_a1(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] @@ -338,7 +338,7 @@ define void @add_c(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] @@ -539,7 +539,7 @@ define void @add_e(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: [[CMP_N13:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[CMP_N13]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] @@ -691,7 +691,7 @@ define void @add_f(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: 
[[CMP_N13:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[CMP_N13]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll index dc844f312f436..9b7e41aa98db6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll @@ -127,7 +127,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef ; CHECK-VS1-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]] ; CHECK-VS1-NEXT: br i1 [[CMP_N10]], label %[[WHILE_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK-VS1: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-VS1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP39]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-VS1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP39]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ] ; CHECK-VS1-NEXT: br label %[[WHILE_BODY:.*]] ; CHECK-VS1: [[WHILE_BODY]]: ; CHECK-VS1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[WHILE_BODY]] ] @@ -235,7 +235,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef 
; CHECK-VS2-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]] ; CHECK-VS2-NEXT: br i1 [[CMP_N10]], label %[[WHILE_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK-VS2: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-VS2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP39]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-VS2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP39]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ] ; CHECK-VS2-NEXT: br label %[[WHILE_BODY:.*]] ; CHECK-VS2: [[WHILE_BODY]]: ; CHECK-VS2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[WHILE_BODY]] ] @@ -453,7 +453,7 @@ define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef % ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[WHILE_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[WHILE_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[WHILE_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label %[[WHILE_BODY:.*]] ; CHECK: [[WHILE_BODY]]: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll index bd9a0fafb1e1c..8cf3ec847b35c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -74,8 +74,8 @@ define i32 @dotp(ptr 
%a, ptr %b) #0 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX7:%.*]] = phi i32 [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX7:%.*]] = phi i32 [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -183,9 +183,9 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[CMP_N15]], label [[WHILE_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IDX_NEG]], [[ITER_CHECK:%.*]] ], [ [[IND_END6]], [[WHILE_BODY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL16:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IV_NEXT]], [[WHILE_BODY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i64 [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[ADD]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END6]], [[WHILE_BODY]] ], [ [[IDX_NEG]], [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL16:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IV_NEXT]], [[WHILE_BODY]] ], 
[ 0, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i64 [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[ADD]], [[WHILE_BODY]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY1:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[IV_NEG:%.*]] = phi i64 [ [[IV_NEG_NEXT:%.*]], [[WHILE_BODY1]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll index d95cb6be0e858..72a1bb2c00e54 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll @@ -181,7 +181,7 @@ define void @test_loop2(i64 %n, ptr %dst) { ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 992, [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll index b439b64e829e5..3982ed6dd26ab 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll @@ -501,8 +501,8 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 
[[TMP8]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll index 1d4b808a612a0..cd71a1c2c04d2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll @@ -70,7 +70,7 @@ define void @cost_store_i8(ptr %dst) #0 { ; DEFAULT-NEXT: [[CMP_N4:%.*]] = icmp eq i64 101, [[N_VEC3]] ; DEFAULT-NEXT: br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; DEFAULT: vec.epilog.scalar.ph: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; DEFAULT-NEXT: br label [[LOOP:%.*]] ; DEFAULT: loop: ; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -207,7 +207,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT: vec.epilog.middle.block: ; DEFAULT-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; DEFAULT: vec.epilog.scalar.ph: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, 
[[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 992, [[VEC_EPILOG_ITER_CHECK]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; DEFAULT-NEXT: br label [[LOOP:%.*]] ; DEFAULT: loop: ; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -268,7 +268,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; PRED: middle.block: ; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; PRED-NEXT: br label [[LOOP:%.*]] ; PRED: loop: ; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll index 6dda3de0d7b0b..754f97c21608a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll @@ -71,8 +71,8 @@ define i64 @int_reduction_and(ptr noalias nocapture %a, i64 %N) { ; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i64 [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1, [[ITER_CHECK]] ], [ [[BIN_RDX]], [[VEC_EPILOG_ITER_CHECK]] ] +; 
CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i64 [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[BIN_RDX]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll index c2b32d87c7a16..9d06363846254 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll @@ -71,8 +71,8 @@ define i64 @int_reduction_add(ptr %a, i64 %N) { ; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i64 [ [[TMP28]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 5, [[ITER_CHECK]] ], [ [[TMP21]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i64 [ [[TMP28]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP21]], [[VEC_EPILOG_ITER_CHECK]] ], [ 5, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll index 5c90ee3a1bc88..af678d7a20f42 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll @@ -66,8 +66,8 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) { ; CHECK-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[N]], [[N_VEC4]] ; CHECK-NEXT: br i1 [[CMP_N5]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi float [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0xFFFFFFFFE0000000, [[ITER_CHECK]] ], [ [[TMP19]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi float [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP19]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0xFFFFFFFFE0000000, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll index 295c0655a4b4d..b7eddd7fdbccf 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll @@ -82,7 +82,7 @@ define void @main_vf_vscale_x_16(ptr %A) #0 { ; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 1024, [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi 
i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -145,7 +145,7 @@ define void @main_vf_vscale_x_16(ptr %A) #0 { ; CHECK-VF8: vec.epilog.middle.block: ; CHECK-VF8-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK-VF8: vec.epilog.scalar.ph: -; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-VF8-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-VF8: for.body: ; CHECK-VF8-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -239,7 +239,7 @@ define void @main_vf_vscale_x_2(ptr %A) #0 vscale_range(8, 8) { ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -302,7 +302,7 @@ define void @main_vf_vscale_x_2(ptr %A) #0 vscale_range(8, 8) { ; CHECK-VF8: 
vec.epilog.middle.block: ; CHECK-VF8-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK-VF8: vec.epilog.scalar.ph: -; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-VF8-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-VF8: for.body: ; CHECK-VF8-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -397,8 +397,8 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 10000, [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N6]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], [[ITER_CHECK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -464,8 +464,8 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-VF8: vec.epilog.middle.block: ; CHECK-VF8-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK-VF8: vec.epilog.scalar.ph: -; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 
[ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-VF8-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], [[ITER_CHECK]] ], [ [[IND_END1]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-VF8-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END1]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[ITER_CHECK]] ] ; CHECK-VF8-NEXT: br label [[LOOP:%.*]] ; CHECK-VF8: loop: ; CHECK-VF8-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll index a8122849c203c..08d0fb77e456d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll @@ -60,7 +60,7 @@ define void @fneg(ptr nocapture noundef writeonly %d, ptr nocapture noundef read ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index bf95622733461..d91b467a44865 100644 --- 
a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -1492,9 +1492,9 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[TMP33:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll index c20be943a2ccf..6e01a5232d1a1 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll @@ -59,7 +59,7 @@ define void @multiple_exits_unique_exit_block(ptr %A, ptr %B, i32 %N) #0 { ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY:%.*]] ] @@ -150,7 +150,7 @@ define i32 @multiple_exits_multiple_exit_blocks(ptr %A, ptr %B, i32 %N) #0 { ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll index 1373266497c61..feb27caf305a2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll @@ -90,7 +90,7 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll index d55ef855604bd..9567123ab8ae3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll @@ -153,7 +153,7 @@ define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_MOD_VF]], [[MIDDLE_BLOCK]] ], [ [[N]], [[VECTOR_MEMCHECK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_MOD_VF]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ], [ [[N]], [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll index c74ceecf7cfe6..84fc963833cf2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll @@ -66,7 +66,7 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 % ; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[N_MOD_VF2]], 0 ; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_EXIT]], label [[SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY1:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = 
phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY1]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll index d4c144ebe5dfb..c430e72cea703 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll @@ -99,7 +99,7 @@ define void @many_deps(ptr noalias %buckets, ptr %array, ptr %indices, ptr %othe ; NORMAL_DEP_LIMIT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; NORMAL_DEP_LIMIT-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; NORMAL_DEP_LIMIT: scalar.ph: -; NORMAL_DEP_LIMIT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY1:%.*]] ] +; NORMAL_DEP_LIMIT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; NORMAL_DEP_LIMIT-NEXT: br label [[FOR_BODY1:%.*]] ; NORMAL_DEP_LIMIT: for.body: ; NORMAL_DEP_LIMIT-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll index 30ee30b595fcb..3b00312959d8a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll @@ -779,7 +779,7 @@ define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr % ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] 
], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll index b629dfd4fce9f..fda9ef2cf6c2f 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll @@ -353,7 +353,7 @@ define void @test_stride_loopinvar_4i32(ptr readonly %data, ptr noalias nocaptur ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-hoist-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-hoist-runtime-checks.ll index de974d267f9d2..6e8bef26d3e83 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-hoist-runtime-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-hoist-runtime-checks.ll @@ -82,7 +82,7 @@ define void @diff_checks(ptr nocapture noundef writeonly %dst, ptr nocapture nou ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTER_LOOP]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; 
CHECK-NEXT: br label [[INNER_LOOP:%.*]] ; CHECK: inner.loop: ; CHECK-NEXT: [[J_021_US:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC_US:%.*]], [[INNER_LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-multiexit.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-multiexit.ll index cc2fbb1b0df79..789a97c052a96 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-multiexit.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-multiexit.ll @@ -42,7 +42,7 @@ define void @multiple_exits_unique_exit_block(ptr %A, ptr %B, i32 %N) #0 { ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY:%.*]] ] @@ -116,7 +116,7 @@ define i32 @multiple_exits_multiple_exit_blocks(ptr %A, ptr %B, i32 %N) #0 { ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll index 2f1af7951dbc2..e5717c4f1d91a 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll +++ 
b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll @@ -144,9 +144,9 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: [[CMP_N33:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC25]] ; CHECK-NEXT: br i1 [[CMP_N33]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL34:%.*]] = phi i64 [ [[N_VEC25]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX35:%.*]] = phi i64 [ [[TMP55]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL36:%.*]] = phi ptr [ [[TMP56]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], %[[ITER_CHECK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL34:%.*]] = phi i64 [ [[N_VEC25]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX35:%.*]] = phi i64 [ [[TMP55]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL36:%.*]] = phi ptr [ [[TMP56]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL34]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll index 32d62befe9fc5..ba9d49fc682c4 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll @@ -120,7 +120,7 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 ; VF-TWO-CHECK-NEXT: [[CMP_N19:%.*]] = icmp eq 
i64 [[WIDE_TRIP_COUNT]], [[N_VEC18]] ; VF-TWO-CHECK-NEXT: br i1 [[CMP_N19]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; VF-TWO-CHECK: vec.epilog.scalar.ph: -; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; VF-TWO-CHECK-NEXT: br label [[FOR_BODY:%.*]] ; VF-TWO-CHECK: for.body: ; VF-TWO-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -252,7 +252,7 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 ; VF-FOUR-CHECK-NEXT: [[CMP_N19:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC18]] ; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N19]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; VF-FOUR-CHECK: vec.epilog.scalar.ph: -; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; VF-FOUR-CHECK-NEXT: br label [[FOR_BODY:%.*]] ; VF-FOUR-CHECK: for.body: ; VF-FOUR-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -438,8 +438,8 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n) ; VF-TWO-CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]] ; VF-TWO-CHECK-NEXT: br i1 [[CMP_N20]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; VF-TWO-CHECK: vec.epilog.scalar.ph: -; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], 
[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] -; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL19:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END18]], [[VEC_EPILOG_ITER_CHECK]] ] +; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] +; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL19:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END18]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] ; VF-TWO-CHECK-NEXT: br label [[FOR_BODY:%.*]] ; VF-TWO-CHECK: for.body: ; VF-TWO-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -597,8 +597,8 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n) ; VF-FOUR-CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]] ; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N20]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; VF-FOUR-CHECK: vec.epilog.scalar.ph: -; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] -; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL19:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END18]], [[VEC_EPILOG_ITER_CHECK]] ] +; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] +; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL19:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END18]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, 
[[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] ; VF-FOUR-CHECK-NEXT: br label [[FOR_BODY:%.*]] ; VF-FOUR-CHECK: for.body: ; VF-FOUR-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll index f2318d6057eec..50dfb689c2060 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll @@ -874,7 +874,7 @@ define void @dead_load_in_block(ptr %dst, ptr %src, i8 %N, i64 %x) #0 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll index 921313ba53818..3839b367ae08c 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll @@ -134,7 +134,7 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 
[[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -257,7 +257,7 @@ define i32 @cost_of_exit_branch_and_cond_insts(ptr %a, ptr %b, i1 %c, i16 %x) #0 ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll index e4425a9327385..2582882baba00 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll @@ -91,8 +91,8 @@ define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[X_I64]], %[[VECTOR_MEMCHECK]] ], [ [[X_I64]], %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i32 [ [[IND_END22]], %[[MIDDLE_BLOCK]] ], [ [[X_I32]], %[[VECTOR_MEMCHECK]] ], [ [[X_I32]], %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[X_I64]], %[[ENTRY]] ], [ [[X_I64]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i32 [ [[IND_END22]], %[[MIDDLE_BLOCK]] ], [ [[X_I32]], %[[ENTRY]] ], [ [[X_I32]], %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; 
CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll index dc63072aa795f..2276b592aac8a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -69,7 +69,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, [[N_VEC]] ; RV32-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; RV32: scalar.ph: -; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; RV32-NEXT: br label [[FOR_BODY:%.*]] ; RV32: for.body: ; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -148,7 +148,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, [[N_VEC]] ; RV64-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; RV64: scalar.ph: -; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; RV64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; RV64-NEXT: br label [[FOR_BODY:%.*]] ; RV64: for.body: ; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll index 30cb33e64eccf..4ea248254f2c6 100644 --- 
a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll @@ -263,7 +263,7 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) { ; NOSTRIDED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; NOSTRIDED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; NOSTRIDED: scalar.ph: -; NOSTRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; NOSTRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; NOSTRIDED-NEXT: br label [[LOOP:%.*]] ; NOSTRIDED: loop: ; NOSTRIDED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] @@ -346,8 +346,8 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) { ; NOSTRIDED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; NOSTRIDED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; NOSTRIDED: scalar.ph: -; NOSTRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; NOSTRIDED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; NOSTRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; NOSTRIDED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; NOSTRIDED-NEXT: br label [[LOOP:%.*]] ; NOSTRIDED: loop: ; NOSTRIDED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] @@ -480,7 +480,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; NOSTRIDED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; NOSTRIDED: scalar.ph: -; 
NOSTRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; NOSTRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; NOSTRIDED-NEXT: br label [[LOOP:%.*]] ; NOSTRIDED: loop: ; NOSTRIDED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] @@ -554,7 +554,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; STRIDED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; STRIDED: scalar.ph: -; STRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; STRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; STRIDED-NEXT: br label [[LOOP:%.*]] ; STRIDED: loop: ; STRIDED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] @@ -623,8 +623,8 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) { ; NOSTRIDED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; NOSTRIDED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; NOSTRIDED: scalar.ph: -; NOSTRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; NOSTRIDED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; NOSTRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; NOSTRIDED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; NOSTRIDED-NEXT: br label [[LOOP:%.*]] ; NOSTRIDED: loop: ; 
NOSTRIDED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] @@ -776,9 +776,9 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; STRIDED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; STRIDED: scalar.ph: -; STRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; STRIDED-NEXT: [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P]], [[VECTOR_MEMCHECK]] ], [ [[P]], [[ENTRY]] ] -; STRIDED-NEXT: [[BC_RESUME_VAL8:%.*]] = phi ptr [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[P2]], [[VECTOR_MEMCHECK]] ], [ [[P2]], [[ENTRY]] ] +; STRIDED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; STRIDED-NEXT: [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY]] ], [ [[P]], [[VECTOR_MEMCHECK]] ] +; STRIDED-NEXT: [[BC_RESUME_VAL8:%.*]] = phi ptr [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[P2]], [[ENTRY]] ], [ [[P2]], [[VECTOR_MEMCHECK]] ] ; STRIDED-NEXT: br label [[LOOP:%.*]] ; STRIDED: loop: ; STRIDED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll index 48b73c7f1a4de..7de51bc3a8a68 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll @@ -62,7 +62,7 @@ define void @type_info_cache_clobber(ptr %dstv, ptr %src, i64 %wide.trip.count) ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, 
%[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll index df9ca218aad70..13286a3394126 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll @@ -54,7 +54,7 @@ define void @test_and(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -148,7 +148,7 @@ define void @test_or(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ 
[[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -242,7 +242,7 @@ define void @test_xor(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -336,7 +336,7 @@ define void @test_shl(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -430,7 +430,7 @@ define void @test_lshr(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 
[[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -524,7 +524,7 @@ define void @test_ashr(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -618,7 +618,7 @@ define void @test_add(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -712,7 +712,7 @@ define void @test_sub(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], 
%[[SCALAR_PH]] ] @@ -806,7 +806,7 @@ define void @test_mul(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -900,7 +900,7 @@ define void @test_sdiv(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -994,7 +994,7 @@ define void @test_udiv(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ 
-1088,7 +1088,7 @@ define void @test_srem(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -1182,7 +1182,7 @@ define void @test_urem(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -1279,7 +1279,7 @@ define void @test_fadd(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -1374,7 +1374,7 @@ 
define void @test_fsub(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -1469,7 +1469,7 @@ define void @test_fmul(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -1564,7 +1564,7 @@ define void @test_fdiv(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -1712,7 +1712,7 @@ define void 
@test_fneg(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll index f07aaecfa8467..28ba5efb53c2a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll @@ -67,7 +67,7 @@ define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -182,7 +182,7 @@ define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; 
IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -297,7 +297,7 @@ define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -412,7 +412,7 @@ define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -519,7 +519,7 @@ define void @vp_ctlz(ptr %a, ptr %b, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label 
%[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -619,7 +619,7 @@ define void @vp_cttz(ptr %a, ptr %b, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -722,7 +722,7 @@ define void @vp_lrint(ptr %a, ptr %b, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -831,7 +831,7 @@ define void @vp_llrint(ptr %a, ptr %b, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -938,7 +938,7 @@ define 
void @vp_abs(ptr %a, ptr %b, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll index 78b9e19fb3966..4557e95f1e1b6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll @@ -59,7 +59,7 @@ define void @vp_sext(ptr %a, ptr %b, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -159,7 +159,7 @@ define void @vp_zext(ptr %a, ptr %b, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], 
%[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -259,7 +259,7 @@ define void @vp_trunc(ptr %a, ptr %b, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -359,7 +359,7 @@ define void @vp_fpext(ptr %a, ptr %b, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -459,7 +459,7 @@ define void @vp_fptrunc(ptr %a, ptr %b, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 
[[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -559,7 +559,7 @@ define void @vp_sitofp(ptr %a, ptr %b, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -659,7 +659,7 @@ define void @vp_uitofp(ptr %a, ptr %b, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -759,7 +759,7 @@ define void @vp_fptosi(ptr %a, ptr %b, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -859,7 +859,7 @@ define void @vp_fptoui(ptr %a, ptr %b, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; 
IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -959,7 +959,7 @@ define void @vp_inttoptr(ptr %a, ptr %b, i64 %N) { ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll index d4881bc50229c..733c05fd9259c 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll @@ -71,8 +71,8 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; IF-EVL-OUTLOOP-NEXT: store i32 [[TMP23]], ptr [[ADDR]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META0]] ; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL-OUTLOOP: scalar.ph: -; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, 
[[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY1:%.*]] ] -; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ [[START]], [[VECTOR_MEMCHECK]] ], [ [[START]], [[ENTRY1]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ], [ [[START]], [[VECTOR_MEMCHECK]] ] ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL-OUTLOOP: for.body: ; IF-EVL-OUTLOOP-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] @@ -134,8 +134,8 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; IF-EVL-INLOOP-NEXT: store i32 [[TMP22]], ptr [[ADDR]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META0]] ; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL-INLOOP: scalar.ph: -; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[VECTOR_MEMCHECK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ], [ [[START]], [[VECTOR_MEMCHECK]] ] ; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL-INLOOP: for.body: ; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -191,8 +191,8 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; NO-VP-OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; NO-VP-OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label 
[[SCALAR_PH]] ; NO-VP-OUTLOOP: scalar.ph: -; NO-VP-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; NO-VP-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ [[START]], [[VECTOR_MEMCHECK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; NO-VP-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ], [ [[START]], [[VECTOR_MEMCHECK]] ] ; NO-VP-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] ; NO-VP-OUTLOOP: for.body: ; NO-VP-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -247,8 +247,8 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; NO-VP-INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; NO-VP-INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; NO-VP-INLOOP: scalar.ph: -; NO-VP-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; NO-VP-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[VECTOR_MEMCHECK]] ], [ [[START]], [[ENTRY]] ] +; NO-VP-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; NO-VP-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ], [ [[START]], [[VECTOR_MEMCHECK]] ] ; NO-VP-INLOOP-NEXT: br label [[FOR_BODY:%.*]] ; NO-VP-INLOOP: for.body: ; NO-VP-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll 
b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll index 4efc231c92a4d..e7fdfbcf76caa 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll @@ -33,7 +33,7 @@ define void @vf_will_not_generate_any_vector_insts(ptr %src, ptr %dst) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[TMP3:%.*]], %[[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll index 15bdbea612a70..f50dffb9ddf15 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll @@ -68,7 +68,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin ; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N12]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 3, [[ITER_CHECK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL6]], 
[[VEC_EPILOG_SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index bd28e28ddff95..50539668b6f70 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -275,8 +275,8 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 ; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC8]] ; CHECK-NEXT: br i1 [[CMP_N14]], label [[LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX15:%.*]] = phi float [ [[TMP157]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ITER_CHECK]] ], [ [[TMP124]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX15:%.*]] = phi float [ [[TMP157]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP124]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR:%.*]] ; CHECK: for: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR]] ] @@ -387,8 +387,8 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 { ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] 
], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV_1_WIDE:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT_WIDE:%.*]], [[LOOP_LATCH:%.*]] ] @@ -832,7 +832,7 @@ define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC39]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC39]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll index f14422e0a6069..c861aa8172b9b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll @@ -46,7 +46,7 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: 
[[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -162,7 +162,7 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll index c14ddca6c913d..7614be0bd3fce 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll @@ -92,7 +92,7 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) { ; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC4]] ; CHECK-NEXT: br i1 [[CMP_N16]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i64 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IV_START]], [[VECTOR_SCEVCHECK]] ], [ [[IV_START]], [[ITER_CHECK:%.*]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i64 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IV_START]], [[VECTOR_SCEVCHECK]] ], [ [[IV_START]], [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL7]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -225,8 +225,8 @@ define void 
@test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l, ; CHECK-NEXT: [[CMP_N25:%.*]] = icmp eq i64 [[L]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[CMP_N25]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL11:%.*]] = phi i16 [ [[IND_END8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END10]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL11:%.*]] = phi i16 [ [[IND_END8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END10]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll index fc6059d036cd0..537fda42d3a1e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -84,8 +84,8 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 { ; AUTO_VEC-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[N_VEC3]], [[ZEXT]] ; AUTO_VEC-NEXT: br i1 [[CMP_N14]], label [[FOR_END]], label [[FOR_BODY]] ; AUTO_VEC: for.body: -; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] -; AUTO_VEC-NEXT: [[X_06:%.*]] = phi float [ [[CONV1:%.*]], [[FOR_BODY]] ], [ [[IND_END8]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[ITER_CHECK]] ], [ [[IND_END6]], 
[[VEC_EPILOG_MIDDLE_BLOCK]] ] +; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; AUTO_VEC-NEXT: [[X_06:%.*]] = phi float [ [[CONV1:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[ITER_CHECK]] ], [ [[IND_END8]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; AUTO_VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]] ; AUTO_VEC-NEXT: store float [[X_06]], ptr [[ARRAYIDX]], align 4 ; AUTO_VEC-NEXT: [[CONV1]] = fadd fast float [[X_06]], 5.000000e-01 @@ -468,8 +468,8 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) { ; AUTO_VEC: for.cond.cleanup: ; AUTO_VEC-NEXT: ret void ; AUTO_VEC: for.body: -; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] -; AUTO_VEC-NEXT: [[X_012:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, [[ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; AUTO_VEC-NEXT: [[X_012:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[ITER_CHECK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; AUTO_VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[P]], i64 [[INDVARS_IV]] ; AUTO_VEC-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; AUTO_VEC-NEXT: [[ADD:%.*]] = fadd reassoc float [[X_012]], [[TMP16]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll index 
8c338d6a746c4..175f4f379146b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -705,8 +705,8 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt ; AVX512-NEXT: [[CMP_N17:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC10]] ; AVX512-NEXT: br i1 [[CMP_N17]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; AVX512: vec.epilog.scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL13:%.*]] = phi ptr [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ], [ [[PTR]], [[ITER_CHECK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ] -; AVX512-NEXT: [[BC_RESUME_VAL16:%.*]] = phi ptr [ [[IND_END14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[DEST]], [[VECTOR_MEMCHECK]] ], [ [[DEST]], [[ITER_CHECK]] ], [ [[IND_END15]], [[VEC_EPILOG_ITER_CHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL13:%.*]] = phi ptr [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ], [ [[PTR]], [[ITER_CHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL16:%.*]] = phi ptr [ [[IND_END14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END15]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[VECTOR_MEMCHECK]] ], [ [[DEST]], [[ITER_CHECK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[PTR_ADDR_012:%.*]] = phi ptr [ [[BC_RESUME_VAL13]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ] @@ -801,8 +801,8 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt ; FVW2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; FVW2-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[SCALAR_PH]] ; FVW2: scalar.ph: -; FVW2-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ], [ [[PTR]], [[FOR_BODY_LR_PH]] ] -; FVW2-NEXT: [[BC_RESUME_VAL8:%.*]] = phi ptr [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[DEST]], [[VECTOR_MEMCHECK]] ], [ [[DEST]], 
[[FOR_BODY_LR_PH]] ] +; FVW2-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[FOR_BODY_LR_PH]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ] +; FVW2-NEXT: [[BC_RESUME_VAL8:%.*]] = phi ptr [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[DEST]], [[FOR_BODY_LR_PH]] ], [ [[DEST]], [[VECTOR_MEMCHECK]] ] ; FVW2-NEXT: br label [[FOR_BODY:%.*]] ; FVW2: for.body: ; FVW2-NEXT: [[PTR_ADDR_012:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll index 304105fd9925a..5ba559af077ca 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll @@ -87,7 +87,7 @@ define void @foo(ptr nocapture %a, ptr nocapture %b, i32 %k, i32 %m) #0 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[FOR_BODY3_LR_PH_US]] ] +; CHECK-NEXT: [[BC_RESUME_VAL]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY3_US]] ; CHECK: for.end15.loopexit: ; CHECK-NEXT: br label [[FOR_END15]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll index 68cbfad91c541..b6e9dfc2322ff 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll @@ -74,7 +74,7 @@ define i32 @iv_used_widened_and_truncated(ptr %dst, i64 %N) #0 { ; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC9]] ; CHECK-NEXT: br i1 [[CMP_N20]], label [[EXIT]], 
label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL10]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP1]] ] @@ -144,7 +144,7 @@ define void @multiple_truncated_ivs_with_wide_uses(i1 %c, ptr %A, ptr %B) { ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -347,11 +347,11 @@ define void @multiple_pointer_ivs_with_scalar_uses_only(ptr %A, ptr %B) #0 { ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP22]], i32 15 ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ -12, [[MIDDLE_BLOCK]] ], [ 100, [[VECTOR_MEMCHECK]] ], [ 100, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2048, [[VECTOR_MEMCHECK]] ], [ 2048, [[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[A]], [[VECTOR_MEMCHECK]] ], [ [[A]], [[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[B]], [[VECTOR_MEMCHECK]] ], [ [[B]], [[ENTRY]] ] -; CHECK-NEXT: 
[[BC_RESUME_VAL6:%.*]] = phi ptr [ [[IND_END5]], [[MIDDLE_BLOCK]] ], [ [[B]], [[VECTOR_MEMCHECK]] ], [ [[B]], [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ -12, [[MIDDLE_BLOCK]] ], [ 100, [[ENTRY:%.*]] ], [ 100, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 2048, [[ENTRY]] ], [ 2048, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ], [ [[A]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ], [ [[B]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[IND_END5]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ], [ [[B]], [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[DEC:%.*]], [[LOOP]] ] @@ -541,7 +541,7 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT]], label [[SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER1:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll index d18d618c6a447..17809b3caf210 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll @@ -203,7 +203,7 @@ define void 
@geps_feeding_interleave_groups_with_reuse(ptr %arg, i64 %arg1, ptr ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -430,7 +430,7 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N) ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT_7:%.*]], %[[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll index f50177e61ef08..8f9fb3dd63b8b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll @@ -93,9 +93,9 @@ define void @uaddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca ; CHECK-NEXT: [[CMP_N26:%.*]] = icmp eq i64 [[N_VEC8]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N26]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 
[[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ], [ [[IND_END10]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi ptr [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PSRC]], [[ITER_CHECK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL16:%.*]] = phi ptr [ [[IND_END14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PDST]], [[ITER_CHECK]] ], [ [[IND_END15]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END10]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi ptr [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL16:%.*]] = phi ptr [ [[IND_END14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END15]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] @@ -212,9 +212,9 @@ define void @fshl(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur ; CHECK-NEXT: [[CMP_N23:%.*]] = icmp eq i64 [[N_VEC7]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N23]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL12:%.*]] = phi ptr [ [[IND_END10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PSRC]], [[ITER_CHECK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi ptr [ [[IND_END13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PDST]], [[ITER_CHECK]] ], [ [[IND_END14]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL12:%.*]] = phi ptr [ [[IND_END10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi ptr [ [[IND_END13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END14]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll index e8e0608329d89..07f17ec8c2cfa 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -70,7 +70,7 @@ define i32 @inv_load_conditional(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC7]] ; CHECK-NEXT: br i1 [[CMP_N16]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC7]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC7]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll index 
adfffccb6bcac..64d3d4121d660 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -82,8 +82,8 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) ; CHECK-NEXT: [[CMP_N18:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC13]] ; CHECK-NEXT: br i1 [[CMP_N18]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX19:%.*]] = phi i32 [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP10]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX19:%.*]] = phi i32 [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP10]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] @@ -210,7 +210,7 @@ define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i3 ; CHECK-NEXT: [[CMP_N23:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC9]] ; CHECK-NEXT: br i1 [[CMP_N23]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, 
[[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] @@ -335,7 +335,7 @@ define void @variant_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b ; CHECK-NEXT: [[CMP_N28:%.*]] = icmp eq i64 [[SMAX10]], [[N_VEC17]] ; CHECK-NEXT: br i1 [[CMP_N28]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll index 50414cc29312c..8688b246c60f4 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll @@ -99,7 +99,7 @@ define void @test_tc_18(ptr noalias %src, ptr noalias %dst) { ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 18, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 18, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ 
[[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP]] ] @@ -172,7 +172,7 @@ define void @test_tc_19(ptr noalias %src, ptr noalias %dst) { ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 18, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 18, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP]] ] @@ -257,7 +257,7 @@ define void @test_tc_20(ptr noalias %src, ptr noalias %dst) { ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll index f0a1e5c47d06a..7f0b6b2f9b4d7 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll @@ -84,7 +84,7 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT]], label [[SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 
100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER1:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index 7e42ffd0f44c3..206bbdf262b72 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -53,7 +53,7 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX1: middle.block: ; AVX1-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX1: scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -160,7 +160,7 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2: vec.epilog.middle.block: ; AVX2-NEXT: br i1 true, label [[FOR_END]], label [[SCALAR_PH]] ; AVX2: vec.epilog.scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ] +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; AVX2-NEXT: br label [[FOR_BODY1:%.*]] ; AVX2: 
for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -267,7 +267,7 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512: vec.epilog.middle.block: ; AVX512-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; AVX512: vec.epilog.scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -354,7 +354,7 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc ; AVX1: middle.block: ; AVX1-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX1: scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -461,7 +461,7 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc ; AVX2: vec.epilog.middle.block: ; AVX2-NEXT: br i1 true, label [[FOR_END]], label [[SCALAR_PH]] ; AVX2: vec.epilog.scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ] +; AVX2-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; AVX2-NEXT: br label [[FOR_BODY1:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -568,7 +568,7 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc ; AVX512: vec.epilog.middle.block: ; AVX512-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; AVX512: vec.epilog.scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -665,7 +665,7 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX1: middle.block: ; AVX1-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX1: scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -778,7 +778,7 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2: vec.epilog.middle.block: ; AVX2-NEXT: br i1 true, label [[FOR_END]], label [[SCALAR_PH]] ; AVX2: 
vec.epilog.scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ] +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; AVX2-NEXT: br label [[FOR_BODY1:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -891,7 +891,7 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512: vec.epilog.middle.block: ; AVX512-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; AVX512: vec.epilog.scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -1019,7 +1019,7 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX1: middle.block: ; AVX1-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX1: scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], 
[[FOR_INC:%.*]] ] @@ -1107,7 +1107,7 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2: middle.block: ; AVX2-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -1222,7 +1222,7 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512: vec.epilog.middle.block: ; AVX512-NEXT: br i1 true, label [[FOR_END]], label [[SCALAR_PH]] ; AVX512: vec.epilog.scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; AVX512-NEXT: br label [[FOR_BODY1:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -1347,7 +1347,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512: middle.block: ; AVX512-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: 
[[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -1527,7 +1527,7 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX2: middle.block: ; AVX2-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 4095, [[VECTOR_MEMCHECK]] ], [ 4095, [[ENTRY:%.*]] ] +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 4095, [[ENTRY:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -1639,7 +1639,7 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX512: middle.block: ; AVX512-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 4095, [[VECTOR_MEMCHECK]] ], [ 4095, [[ENTRY:%.*]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 4095, [[ENTRY:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -1806,7 +1806,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] ; AVX1-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; AVX1: vec.epilog.scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], 
[[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; AVX1-NEXT: br label [[FOR_BODY1:%.*]] ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -1942,7 +1942,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] ; AVX2-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; AVX2: vec.epilog.scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; AVX2-NEXT: br label [[FOR_BODY1:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -2078,7 +2078,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] ; AVX512-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; AVX512: vec.epilog.scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; AVX512-NEXT: br label [[FOR_BODY1:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -2259,7 +2259,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture 
readonly %in ; AVX1-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] ; AVX1-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; AVX1: vec.epilog.scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; AVX1-NEXT: br label [[FOR_BODY1:%.*]] ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -2395,7 +2395,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] ; AVX2-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; AVX2: vec.epilog.scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; AVX2-NEXT: br label [[FOR_BODY1:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -2531,7 +2531,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC9]] ; AVX512-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; AVX512: vec.epilog.scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], 
[[VEC_EPILOG_ITER_CHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; AVX512-NEXT: br label [[FOR_BODY1:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll index 6c97ab362fc86..99d39f3e88983 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/multi-exit-cost.ll @@ -43,8 +43,8 @@ define i64 @test_value_in_exit_compare_chain_used_outside(ptr %src, i64 %x, i64 ; CHECK-NEXT: [[TMP31:%.*]] = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> [[TMP29]]) ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP31]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP31]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll index f7a3a1245c286..d00edeb1c9e6d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll @@ -75,7 +75,7 @@ define void @foo(ptr addrspace(1) align 8 dereferenceable_or_null(16), ptr 
addrs ; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC8]] ; CHECK-NEXT: br i1 [[CMP_N12]], label [[LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT4:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll index 1194e82325ef7..1845243d1278e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll @@ -83,8 +83,8 @@ define i32 @main(ptr %ptr) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP4]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DOTPROMOTED]], [[VECTOR_SCEVCHECK]] ], [ [[DOTPROMOTED]], [[FOR_BODY8_LR_PH]] ] -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i8 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[CONV3]], [[VECTOR_SCEVCHECK]] ], [ [[CONV3]], [[FOR_BODY8_LR_PH]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DOTPROMOTED]], [[FOR_BODY8_LR_PH]] ], [ [[DOTPROMOTED]], [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i8 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[CONV3]], [[FOR_BODY8_LR_PH]] ], [ [[CONV3]], [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY8:%.*]] ; CHECK: for.body8: ; CHECK-NEXT: [[INC5:%.*]] = phi i32 [ [[BC_RESUME_VAL]], 
[[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY8]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll b/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll index 90b86bb3ffb07..ee8374f952c7a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll @@ -37,8 +37,8 @@ define void @foo(ptr %ptr, ptr %ptr.2) { ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 80, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 82, [[MIDDLE_BLOCK]] ], [ 2, [[VECTOR_MEMCHECK]] ], [ 2, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 80, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 82, [[MIDDLE_BLOCK]] ], [ 2, [[ENTRY]] ], [ 2, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: unreachable diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll index d8ec92124682a..7b1c7ae94ff41 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll @@ -307,7 +307,7 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: [[CMP_N34:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC25]] ; AVX1-NEXT: br i1 [[CMP_N34]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; AVX1: vec.epilog.scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; AVX1-NEXT: br label [[FOR_BODY1:%.*]] ; AVX1: 
for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll index 1cee80f88ec62..23a6a1286a0f0 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll @@ -92,7 +92,7 @@ define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo ; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[CMP_N15]], label [[L44]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[L27:%.*]] ; CHECK: L26: ; CHECK-NEXT: [[VALUE_PHI5:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[TMP27:%.*]], [[L27]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll index c317e877ee8a5..5476ff504edb3 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll @@ -49,7 +49,7 @@ define void @pr56319(ptr noalias %src, ptr noalias %dst) { ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 36, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 32, [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ 36, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 32, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll index d1c0201ccb9a4..0cd746590e0f1 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll @@ -85,9 +85,9 @@ define void @test(ptr %p) { ; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] ; VEC-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; VEC: scalar.ph: -; VEC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY:%.*]] ] -; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY]] ] -; VEC-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; VEC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ], [ 1, [[VECTOR_SCEVCHECK]] ] +; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ], [ 1, [[VECTOR_SCEVCHECK]] ] +; VEC-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; VEC-NEXT: br label [[FOR_BODY:%.*]] ; VEC: for.body: ; VEC-NEXT: [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IDX:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll index a0294f7ac7992..58d3ead2d2919 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll @@ -109,8 +109,8 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[CMP_N23:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC7]] ; CHECK-NEXT: br i1 [[CMP_N23]], label [[FOR_COND_CLEANUP_LOOPEXIT99]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[IND_END8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 8, [[ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[IND_END8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ], [ 8, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: iter.check23: ; CHECK-NEXT: [[TMP26:%.*]] = add nsw i64 [[TMP3]], -9 @@ -210,8 +210,8 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[CMP_N65:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC53]] ; CHECK-NEXT: br i1 [[CMP_N65]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH41]] ; CHECK: vec.epilog.scalar.ph41: -; CHECK-NEXT: [[BC_RESUME_VAL56:%.*]] = phi i64 [ [[IND_END54]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ 8, [[ITER_CHECK27]] ], [ [[IND_END55]], [[VEC_EPILOG_ITER_CHECK43]] ] -; CHECK-NEXT: [[BC_RESUME_VAL67:%.*]] = phi i64 [ [[IND_END57]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ 0, [[ITER_CHECK27]] ], [ [[IND_END58]], [[VEC_EPILOG_ITER_CHECK43]] ] +; CHECK-NEXT: [[BC_RESUME_VAL56:%.*]] = phi i64 [ [[IND_END54]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ [[IND_END55]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 8, [[ITER_CHECK27]] ] +; CHECK-NEXT: [[BC_RESUME_VAL67:%.*]] = phi i64 [ [[IND_END57]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ [[IND_END58]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, 
[[ITER_CHECK27]] ] ; CHECK-NEXT: br label [[FOR_BODY_US:%.*]] ; CHECK: for.body.us: ; CHECK-NEXT: [[INDVARS_IV78:%.*]] = phi i64 [ [[INDVARS_IV_NEXT79:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US:%.*]] ], [ [[BC_RESUME_VAL56]], [[VEC_EPILOG_SCALAR_PH41]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll index d316befb9548d..ad8f1fb3ccd21 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll @@ -222,8 +222,8 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[TMP170:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP168]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP170]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP170]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY1:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY1]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ [[TMP170]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] @@ -454,8 +454,8 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; MAX-BW-NEXT: [[TMP170:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP168]]) ; MAX-BW-NEXT: br i1 true, label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] ; MAX-BW: vec.epilog.scalar.ph: -; 
MAX-BW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ] -; MAX-BW-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP170]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ] +; MAX-BW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; MAX-BW-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP170]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP149]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; MAX-BW-NEXT: br label [[FOR_BODY1:%.*]] ; MAX-BW: for.cond.cleanup: ; MAX-BW-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], [[FOR_BODY1]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ [[TMP170]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll index 26a9e179e3c53..45594b0335336 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll @@ -299,7 +299,7 @@ define void @uniform_copy(ptr %A, ptr %B) { ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll index eb52420949f8d..90ba702ed232e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -82,7 +82,7 @@ define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll index a3b066ed82216..e37eae4c1f390 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll @@ -141,7 +141,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; NO-VP-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[N_VEC10]] ; NO-VP-NEXT: br i1 [[CMP_N11]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_SCALAR_PH]] ; NO-VP: vec.epilog.scalar.ph: -; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC10]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; NO-VP-NEXT: br label [[FOR_BODY:%.*]] ; NO-VP: for.body: ; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/dead_instructions.ll 
b/llvm/test/Transforms/LoopVectorize/dead_instructions.ll index d19fe8416200e..e1fd07bf590c4 100644 --- a/llvm/test/Transforms/LoopVectorize/dead_instructions.ll +++ b/llvm/test/Transforms/LoopVectorize/dead_instructions.ll @@ -165,7 +165,7 @@ define void @dead_load_and_vector_pointer(ptr %a, ptr %b) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 128, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll index 06f0f05889116..5437c54409cd5 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll @@ -72,8 +72,8 @@ define i64 @select_icmp_const(ptr %a, i64 %n) { ; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N12]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 3, %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 3, %[[ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; 
CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -178,8 +178,8 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) { ; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N12]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 2, %[[ITER_CHECK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i64 [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 2, %[[ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll index 94593a7d9a81d..a829decb5231e 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll @@ -67,8 +67,8 @@ define i32 @any_of_reduction_epilog(ptr %src, i64 %N) { ; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], 
[[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i32 [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i32 [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -166,8 +166,8 @@ define i32 @any_of_reduction_epilog_arg_as_start_value(ptr %src, i64 %N, i32 %st ; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i32 [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i32 [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -274,9 +274,9 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) { ; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N8]], label [[EXIT]], label 
[[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i1 [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ false, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i1 [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ false, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -426,8 +426,8 @@ define i1 @any_of_reduction_i1_epilog2(ptr %start, ptr %end, i64 %x) { ; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC8]] ; CHECK-NEXT: br i1 [[CMP_N10]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_MERGE_RDX23:%.*]] = phi i1 [ [[RDX_SELECT22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ true, [[ITER_CHECK:%.*]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START]], [[ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX23:%.*]] = phi i1 [ [[RDX_SELECT22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ true, [[ITER_CHECK:%.*]] ] +; 
CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[RED:%.*]] = phi i1 [ [[BC_MERGE_RDX23]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll index 54489af8c9f12..4d260984bfe9d 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll @@ -58,8 +58,8 @@ define i64 @int_reduction_add(ptr %a, i64 %N) { ; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N8]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX9:%.*]] = phi i64 [ [[TMP12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 5, [[ITER_CHECK]] ], [ [[TMP5]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX9:%.*]] = phi i64 [ [[TMP12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP5]], [[VEC_EPILOG_ITER_CHECK]] ], [ 5, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -151,8 +151,8 @@ define float @fp_reduction_max(ptr noalias %a, i64 %N) { ; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N8]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX9:%.*]] = phi float [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX9:%.*]] = phi float [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] @@ -245,8 +245,8 @@ define i16 @reduction_or_trunc(ptr noalias nocapture %ptr) { ; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP23]] to i32 ; CHECK-NEXT: br i1 true, label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 256, [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX5:%.*]] = phi i32 [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 256, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX5:%.*]] = phi i32 [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] @@ -349,9 +349,9 @@ define float @multiple_fp_rdx(ptr %A, i64 %N) { ; CHECK-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[N_VEC5]] ; CHECK-NEXT: br i1 
[[CMP_N11]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX12:%.*]] = phi float [ [[TMP16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1.500000e+01, [[ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX13:%.*]] = phi float [ [[TMP17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1.000000e+01, [[ITER_CHECK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX12:%.*]] = phi float [ [[TMP16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.500000e+01, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX13:%.*]] = phi float [ [[TMP17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP7]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+01, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -452,8 +452,8 @@ define i32 @reduction_phi_start_val(ptr %A, i64 %N) { ; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N8]], label [[FOR_COND]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX9:%.*]] = phi i32 [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[START_SUM]], [[ITER_CHECK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] +; 
CHECK-NEXT: [[BC_MERGE_RDX9:%.*]] = phi i32 [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START_SUM]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-trunc-induction-steps.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-trunc-induction-steps.ll index 06ca7b197279c..7a92d1a1c9ea5 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-trunc-induction-steps.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-trunc-induction-steps.ll @@ -63,7 +63,7 @@ define void @trunc_iv_steps_with_epilogue(ptr %A, i64 %N) { ; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV_I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll index 253ecaca75be8..a97fcc9f437c4 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll @@ -852,8 +852,8 @@ define void @sink_dominance(ptr %ptr, i32 %N) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], 
[[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ] @@ -935,8 +935,8 @@ define void @sink_dominance_2(ptr %ptr, i32 %N) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/fpsat.ll b/llvm/test/Transforms/LoopVectorize/fpsat.ll index 7df0a34c35b85..77c4e8d7c68bf 100644 --- a/llvm/test/Transforms/LoopVectorize/fpsat.ll +++ 
b/llvm/test/Transforms/LoopVectorize/fpsat.ll @@ -37,7 +37,7 @@ define void @signed(ptr %x, ptr %y, i32 %n) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] @@ -113,7 +113,7 @@ define void @unsigned(ptr %x, ptr %y, i32 %n) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll index ff5d45d3f7f0f..492eb091175e2 100644 --- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll +++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll @@ -46,7 +46,7 @@ define i32 @foo(ptr nocapture %A, ptr nocapture %B, i32 %n) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, 
[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[IF_END14:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -160,7 +160,7 @@ define i32 @multi_variable_if_nest(ptr nocapture %A, ptr nocapture %B, i32 %n) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[IF_END14:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll index e0a93ce877358..19660063f509b 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll @@ -273,7 +273,7 @@ define void @test(ptr nocapture %asd, ptr nocapture %aud, ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; UNROLL-NO-VF-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-VF: for.cond.cleanup: ; UNROLL-NO-VF-NEXT: ret void @@ -484,7 +484,7 @@ define void 
@test_scalar2scalar(ptr nocapture %asd, ptr nocapture %bsd) { ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; UNROLL-NO-VF-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-VF: for.cond.cleanup: ; UNROLL-NO-VF-NEXT: ret void @@ -682,7 +682,7 @@ define void @pr30172(ptr nocapture %asd, ptr nocapture %bsd) !dbg !5 {; ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; UNROLL-NO-VF-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-VF: for.cond.cleanup: ; UNROLL-NO-VF-NEXT: ret void diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index 4bbf1e8fcc76a..f6ca9ff2e7b2e 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -302,7 +302,7 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; 
CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -362,7 +362,7 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) { ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; IND-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: -; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; IND-NEXT: br label [[FOR_BODY:%.*]] ; IND: for.body: ; IND-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -429,7 +429,7 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) { ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: -; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; UNROLL-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL: for.body: ; UNROLL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -500,7 +500,7 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) { ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, 
[[VECTOR_MEMCHECK]] ] ; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-IC: for.body: ; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -567,7 +567,7 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) { ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; INTERLEAVE-NEXT: br label [[FOR_BODY:%.*]] ; INTERLEAVE: for.body: ; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -1593,7 +1593,7 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -1656,7 +1656,7 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; IND-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: -; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; IND-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; IND-NEXT: br label [[FOR_BODY:%.*]] ; IND: for.body: ; IND-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -1733,7 +1733,7 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: -; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; UNROLL-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL: for.body: ; UNROLL-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -1812,7 +1812,7 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-IC: for.body: ; UNROLL-NO-IC-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -1900,7 +1900,7 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: br label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] 
+; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; INTERLEAVE-NEXT: br label [[FOR_BODY:%.*]] ; INTERLEAVE: for.body: ; INTERLEAVE-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -3406,9 +3406,9 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -3466,14 +3466,14 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) ; IND-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; IND-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; IND-NEXT: br i1 [[TMP12]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; IND: middle.block: ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; IND-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: -; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ] -; IND-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ] -; IND-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ] +; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] +; IND-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; IND-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ] ; IND-NEXT: br label [[LOOP:%.*]] ; IND: loop: ; IND-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -3540,9 +3540,9 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: -; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ] -; UNROLL-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ] -; UNROLL-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], 
[[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ] ; UNROLL-NEXT: br label [[LOOP:%.*]] ; UNROLL: loop: ; UNROLL-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -3614,9 +3614,9 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ] -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ] -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ] +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: loop: ; UNROLL-NO-IC-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -3682,9 +3682,9 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i32 
[[TMP0]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ] -; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ] -; INTERLEAVE-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ] ; INTERLEAVE-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE: loop: ; INTERLEAVE-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -3786,9 +3786,9 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], 
[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -3854,9 +3854,9 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; IND-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: -; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ] -; IND-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ] -; IND-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ] +; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] +; IND-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ] +; IND-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; IND-NEXT: br label [[LOOP:%.*]] ; IND: loop: ; IND-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -3926,9 +3926,9 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: -; UNROLL-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ] -; UNROLL-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ] -; UNROLL-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; UNROLL-NEXT: br label [[LOOP:%.*]] ; UNROLL: loop: ; UNROLL-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -4003,9 +4003,9 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ] -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ] -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ] +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 
[[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ] +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: loop: ; UNROLL-NO-IC-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -4074,9 +4074,9 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ] -; INTERLEAVE-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ] -; INTERLEAVE-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; INTERLEAVE-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE: loop: ; INTERLEAVE-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -4357,7 +4357,7 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi 
i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -4395,7 +4395,7 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) { ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]] ; IND-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: -; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ] +; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; IND-NEXT: br label [[FOR_BODY:%.*]] ; IND: for.body: ; IND-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -4438,7 +4438,7 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) { ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: -; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; UNROLL-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL: for.body: ; UNROLL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -4487,7 +4487,7 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) { ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 
[[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ] +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-IC: for.body: ; UNROLL-NO-IC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -4528,7 +4528,7 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) { ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; INTERLEAVE-NEXT: br label [[FOR_BODY:%.*]] ; INTERLEAVE: for.body: ; INTERLEAVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -6170,9 +6170,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: 
[[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ] @@ -6241,9 +6241,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; IND-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: -; IND-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] -; IND-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; IND-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; IND-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; IND-NEXT: br label [[LOOP:%.*]] ; IND: loop: ; IND-NEXT: [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ] @@ -6316,9 +6316,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; UNROLL-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; UNROLL: 
scalar.ph: -; UNROLL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] -; UNROLL-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; UNROLL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; UNROLL-NEXT: br label [[LOOP:%.*]] ; UNROLL: loop: ; UNROLL-NEXT: [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ] @@ -6397,9 +6397,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; UNROLL-NO-IC-NEXT: 
[[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: loop: ; UNROLL-NO-IC-NEXT: [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ] @@ -6472,9 +6472,9 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: -; INTERLEAVE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] -; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; INTERLEAVE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; INTERLEAVE-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE: loop: ; INTERLEAVE-NEXT: [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_2_CONV:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-3.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-3.ll index 81e9494ea70a7..661e8eb666d54 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-3.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-3.ll @@ -67,8 +67,8 @@ define void @_Z4funcPjS_hh(ptr noalias nocapture readonly %a, ptr noalias nocapt ; CHECK: 
[[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i8 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[X]], %[[VECTOR_SCEVCHECK]] ], [ [[X]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i8 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[X]], %[[FOR_BODY_PREHEADER]] ], [ [[X]], %[[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT:.*]]: ; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]] diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll index abd91d33157e6..e8ae770ff4c0f 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -1508,9 +1508,9 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, 
[[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[TMP16:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll index e8ad6a38d742c..14ed4c91c8757 100644 --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll @@ -55,7 +55,7 @@ define void @inv_val_store_to_inv_address_conditional_diff_values_ic(ptr %a, i64 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -153,7 +153,7 @@ define void @inv_val_store_to_inv_address_conditional_inv(ptr %a, i64 %n, ptr %b ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ 
[[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -241,8 +241,8 @@ define i32 @variant_val_store_to_inv_address(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll index aef25a05ea124..d26362f0c162e 100644 --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll @@ -50,8 +50,8 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP4]], 
[[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -122,7 +122,7 @@ define void @inv_val_store_to_inv_address(ptr %a, i64 %n, ptr %b) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -224,7 +224,7 @@ define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i3 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -401,8 +401,8 @@ define i32 @multiple_uniform_stores(ptr nocapture %var1, ptr nocapture readonly ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_INC8_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[ARRAYIDX5_PROMOTED]], [[VECTOR_MEMCHECK]] ], [ [[ARRAYIDX5_PROMOTED]], 
[[FOR_BODY3_LR_PH]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP4]], [[VECTOR_MEMCHECK]] ], [ [[TMP4]], [[FOR_BODY3_LR_PH]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[ARRAYIDX5_PROMOTED]], [[FOR_BODY3_LR_PH]] ], [ [[ARRAYIDX5_PROMOTED]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP4]], [[FOR_BODY3_LR_PH]] ], [ [[TMP4]], [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY3:%.*]] ; CHECK: for.body3: ; CHECK-NEXT: [[TMP20:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP22:%.*]], [[FOR_BODY3]] ] diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll index cbc483fabc184..5fcd0de911f85 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll @@ -265,8 +265,8 @@ define i32 @loop_requires_scev_predicate(ptr %dest, i32 %end) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll 
b/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll index 851db7c44d51e..a0cd3c64f2d77 100644 --- a/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll @@ -84,7 +84,7 @@ define void @Test(ptr nocapture %obj, i64 %z) #0 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[Z]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[DOTOUTER]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[DOTOUTER_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTOUTER_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[DOTINNER:%.*]] ; CHECK: .exit: ; CHECK-NEXT: ret void @@ -151,7 +151,7 @@ define void @Test(ptr nocapture %obj, i64 %z) #0 { ; CHECK-HOIST-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[Z]], [[N_VEC]] ; CHECK-HOIST-NEXT: br i1 [[CMP_N]], label [[DOTOUTER]], label [[SCALAR_PH]] ; CHECK-HOIST: scalar.ph: -; CHECK-HOIST-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[DOTOUTER_PREHEADER]] ] +; CHECK-HOIST-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTOUTER_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-HOIST-NEXT: br label [[DOTINNER:%.*]] ; CHECK-HOIST: .exit: ; CHECK-HOIST-NEXT: ret void diff --git a/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll index 66996316b47b7..5f3abd2d8dbd3 100644 --- a/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll +++ b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll @@ -48,8 +48,8 @@ define i32 @test(ptr %arr, i64 %n) { ; CHECK-NEXT: [[IND_ESCAPE:%.*]] = sub i64 [[IND_END]], 1 ; CHECK-NEXT: br i1 [[CMP_N]], label 
[[LOAD_VAL:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i8 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ], [ 1, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i8 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ], [ 1, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[CONV:%.*]] = phi i64 [ [[CONV2:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll index 24e60d45e1671..b3198587e5ae1 100644 --- a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll @@ -687,7 +687,7 @@ define i32 @sum_arrays_outside_use(ptr %B, ptr %A, ptr %C, i32 %N) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[F1_EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[VECTOR_MEMCHECK]] ], [ [[B_PROMOTED]], %[[BB]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ [[B_PROMOTED]], %[[BB]] ], [ [[B_PROMOTED]], %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[DOTLR_PH_I:.*]] ; CHECK: [[_LR_PH_I:.*:]] ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IVNEXT:%.*]], %[[DOTLR_PH_I]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/opaque-ptr.ll b/llvm/test/Transforms/LoopVectorize/opaque-ptr.ll index b001b6fa56208..f62c3c7f42ec4 100644 --- a/llvm/test/Transforms/LoopVectorize/opaque-ptr.ll +++ 
b/llvm/test/Transforms/LoopVectorize/opaque-ptr.ll @@ -64,8 +64,8 @@ define void @test_ptr_iv_no_inbounds(ptr %p1.start, ptr %p2.start, ptr %p1.end) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P1_START]], [[VECTOR_MEMCHECK]] ], [ [[P1_START]], [[VECTOR_SCEVCHECK]] ], [ [[P1_START]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi ptr [ [[IND_END8]], [[MIDDLE_BLOCK]] ], [ [[P2_START]], [[VECTOR_MEMCHECK]] ], [ [[P2_START]], [[VECTOR_SCEVCHECK]] ], [ [[P2_START]], [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P1_START]], [[ENTRY:%.*]] ], [ [[P1_START]], [[VECTOR_SCEVCHECK]] ], [ [[P1_START]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi ptr [ [[IND_END8]], [[MIDDLE_BLOCK]] ], [ [[P2_START]], [[ENTRY]] ], [ [[P2_START]], [[VECTOR_SCEVCHECK]] ], [ [[P2_START]], [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[P1:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[P1_NEXT:%.*]], [[LOOP]] ] @@ -154,8 +154,8 @@ define void @test_ptr_iv_with_inbounds(ptr %p1.start, ptr %p2.start, ptr %p1.end ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P1_START]], [[VECTOR_MEMCHECK]] ], [ [[P1_START]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi ptr [ [[IND_END6]], [[MIDDLE_BLOCK]] ], [ [[P2_START]], [[VECTOR_MEMCHECK]] ], [ [[P2_START]], [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P1_START]], [[ENTRY:%.*]] ], [ [[P1_START]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi ptr [ [[IND_END6]], 
[[MIDDLE_BLOCK]] ], [ [[P2_START]], [[ENTRY]] ], [ [[P2_START]], [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[P1:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[P1_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll index 06ac6e75cd74b..d328b565b83fe 100644 --- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll @@ -70,7 +70,7 @@ define signext i32 @f1(ptr noalias %A, ptr noalias %B, i32 signext %n) { ; VF-TWO-CHECK-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC4]] ; VF-TWO-CHECK-NEXT: br i1 [[CMP_N5]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; VF-TWO-CHECK: vec.epilog.scalar.ph: -; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; VF-TWO-CHECK-NEXT: br label [[FOR_BODY:%.*]] ; VF-TWO-CHECK: for.body: ; VF-TWO-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll index 065e38e9fa5cf..4db0153fe8491 100644 --- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll @@ -76,7 +76,7 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 ; CHECK-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC4]] ; CHECK-NEXT: br i1 
[[CMP_N5]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -234,8 +234,8 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n) ; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -363,7 +363,7 @@ define void @f3(ptr noalias %A, i64 %n) { ; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 
[[CMP_N4]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -422,7 +422,7 @@ define void @f3(ptr noalias %A, i64 %n) { ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[CMP_N4]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.scalar.ph: -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-PROFITABLE-BY-DEFAULT: for.body: ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -528,8 +528,8 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[OUTER_LATCH]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 85, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1, [[ITER_CHECK]] ], [ 85, [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i8 [ [[IND_END4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], 
[ 0, [[ITER_CHECK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 85, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 85, [[VEC_EPILOG_ITER_CHECK]] ], [ 1, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i8 [ [[IND_END4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[INNER:%.*]] ; CHECK: inner: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL3]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[INNER]] ] @@ -618,8 +618,8 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst ; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.middle.block: ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 true, label [[OUTER_LATCH]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.scalar.ph: -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 85, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1, [[ITER_CHECK]] ], [ 85, [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i8 [ [[IND_END4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 85, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 85, [[VEC_EPILOG_ITER_CHECK]] ], [ 1, [[ITER_CHECK]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i8 [ [[IND_END4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[INNER:%.*]] ; CHECK-PROFITABLE-BY-DEFAULT: inner: ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL3]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[INNER]] ] @@ -720,7 +720,7 @@ define void @f4(ptr noalias %A, i32 signext %n) { ; CHECK-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N5]], label [[EXIT]], label 
[[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -790,7 +790,7 @@ define void @f4(ptr noalias %A, i32 signext %n) { ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC3]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[CMP_N5]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.scalar.ph: -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[LOOP:%.*]] ; CHECK-PROFITABLE-BY-DEFAULT: loop: ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -894,8 +894,8 @@ define void @multiple_ivs_wide(ptr %dst) { ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ 128, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 128, [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ 64, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ -64, [[ITER_CHECK]] ], [ 64, [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi 
i32 [ 128, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 128, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ 64, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 64, [[VEC_EPILOG_ITER_CHECK]] ], [ -64, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll index 03812f4acb1eb..738f265b89d65 100644 --- a/llvm/test/Transforms/LoopVectorize/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/optsize.ll @@ -489,7 +489,7 @@ define void @pr43371_pgso() !prof !14 { ; NPGSO: [[MIDDLE_BLOCK]]: ; NPGSO-NEXT: br i1 true, label %[[FOR_COND_CLEANUP28:.*]], label %[[SCALAR_PH]] ; NPGSO: [[SCALAR_PH]]: -; NPGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 756, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ENTRY]] ] +; NPGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 756, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; NPGSO-NEXT: br label %[[FOR_BODY29:.*]] ; NPGSO: [[FOR_COND_CLEANUP28]]: ; NPGSO-NEXT: unreachable @@ -854,7 +854,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -888,7 +888,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 { ; PGSO: [[MIDDLE_BLOCK]]: ; PGSO-NEXT: br i1 false, label %[[FOR_END:.*]], 
label %[[SCALAR_PH]] ; PGSO: [[SCALAR_PH]]: -; PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ENTRY]] ] +; PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; PGSO-NEXT: br label %[[FOR_BODY:.*]] ; PGSO: [[FOR_BODY]]: ; PGSO-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -922,7 +922,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 { ; NPGSO: [[MIDDLE_BLOCK]]: ; NPGSO-NEXT: br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]] ; NPGSO: [[SCALAR_PH]]: -; NPGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ENTRY]] ] +; NPGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; NPGSO-NEXT: br label %[[FOR_BODY:.*]] ; NPGSO: [[FOR_BODY]]: ; NPGSO-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/pointer-select-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/pointer-select-runtime-checks.ll index 00daf7d34483c..e652d86944c4d 100644 --- a/llvm/test/Transforms/LoopVectorize/pointer-select-runtime-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/pointer-select-runtime-checks.ll @@ -42,7 +42,7 @@ define void @test1_select_invariant(ptr %src.1, ptr %src.2, ptr %dst, i1 %c, i8 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; 
CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -121,7 +121,7 @@ define void @test_loop_dependent_select1(ptr %src.1, ptr %src.2, ptr %dst, i1 %c ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -211,7 +211,7 @@ define void @test_loop_dependent_select2(ptr %src.1, ptr %src.2, ptr %dst, i8 %n ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -296,7 +296,7 @@ define void @test_loop_dependent_select_first_ptr_noundef(ptr noundef %src.1, pt ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: 
; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -381,7 +381,7 @@ define void @test_loop_dependent_select_second_ptr_noundef(ptr %src.1, ptr nound ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll b/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll index 2741b39693579..0cda697e0337a 100644 --- a/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll +++ b/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll @@ -92,8 +92,8 @@ define void @doit1(i32 %n, i32 %step) local_unnamed_addr { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: 
[[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -215,8 +215,8 @@ define void @doit2(i32 %n, i32 %step) local_unnamed_addr { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -410,8 +410,8 @@ define void @doit4(i32 %n, i8 signext %cstep) local_unnamed_addr { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; 
CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/pr37248.ll b/llvm/test/Transforms/LoopVectorize/pr37248.ll index fe660a8269672..767fd2e188aae 100644 --- a/llvm/test/Transforms/LoopVectorize/pr37248.ll +++ b/llvm/test/Transforms/LoopVectorize/pr37248.ll @@ -71,7 +71,7 @@ define void @f1(ptr noalias %b, i1 %c, i32 %start) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[VECTOR_SCEVCHECK]] ], [ [[START]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ], [ [[START]], [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[TMP19:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[DEC:%.*]], [[LAND_END:%.*]] ] @@ -156,7 +156,7 @@ define void @f2(ptr noalias %b, i1 %c, i32 %start) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[VECTOR_SCEVCHECK]] ], [ [[START]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ], [ [[START]], [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[TMP16:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[DEC:%.*]], [[LAND_END:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/pr45259.ll b/llvm/test/Transforms/LoopVectorize/pr45259.ll index 9ae8f69b50a90..6baed089fb6b6 100644 --- a/llvm/test/Transforms/LoopVectorize/pr45259.ll +++ 
b/llvm/test/Transforms/LoopVectorize/pr45259.ll @@ -58,7 +58,7 @@ define i8 @widget(ptr %arr, i8 %t9) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[FOR_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll b/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll index 3d85a7289637f..6aaa44344ae46 100644 --- a/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll +++ b/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll @@ -54,7 +54,7 @@ define void @f() { ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 500, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 500, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/pr50686.ll b/llvm/test/Transforms/LoopVectorize/pr50686.ll index 4d8562247871a..6ce491e53c256 100644 --- a/llvm/test/Transforms/LoopVectorize/pr50686.ll +++ b/llvm/test/Transforms/LoopVectorize/pr50686.ll @@ -40,7 +40,7 @@ define void @m(ptr nocapture %p, ptr nocapture %p2, i32 %q) { ; CHECK: middle.block: ; 
CHECK-NEXT: br i1 false, label [[FOR_END17:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 60, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 60, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_COND5:%.*]] ; CHECK: for.cond5: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND5]] ] diff --git a/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll b/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll index a4b229d0a96b2..95ae2de117666 100644 --- a/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll +++ b/llvm/test/Transforms/LoopVectorize/pr59319-loop-access-info-invalidation.ll @@ -63,7 +63,7 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) { ; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N10]], label [[LOOP_3_LR_PH:%.*]], label [[SCALAR_PH5]] ; CHECK: scalar.ph5: -; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK4]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[LOOP_2_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK4]] ], [ 0, [[LOOP_2_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP_2:%.*]] ; CHECK: loop.3.lr.ph: ; CHECK-NEXT: [[IDXPROM_I_I61:%.*]] = and i64 [[IV761_LCSSA]], 1 @@ -93,7 +93,7 @@ define void @reduced(ptr %0, ptr %1, i64 %iv, ptr %2, i64 %iv76, i64 %iv93) { ; CHECK-NEXT: [[CMP_N27:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC25]] ; CHECK-NEXT: br i1 [[CMP_N27]], label [[LOOP_CLEANUP:%.*]], label [[SCALAR_PH22]] ; CHECK: scalar.ph22: -; CHECK-NEXT: [[BC_RESUME_VAL26:%.*]] = phi i64 [ [[N_VEC25]], [[MIDDLE_BLOCK21]] ], [ 0, [[VECTOR_MEMCHECK15]] ], [ 0, [[LOOP_3_LR_PH]] ] +; CHECK-NEXT: 
[[BC_RESUME_VAL26:%.*]] = phi i64 [ [[N_VEC25]], [[MIDDLE_BLOCK21]] ], [ 0, [[LOOP_3_LR_PH]] ], [ 0, [[VECTOR_MEMCHECK15]] ] ; CHECK-NEXT: br label [[LOOP_3:%.*]] ; CHECK: loop.2: ; CHECK-NEXT: [[IV846:%.*]] = phi i64 [ [[IV_NEXT85:%.*]], [[LOOP_2]] ], [ [[BC_RESUME_VAL13]], [[SCALAR_PH5]] ] diff --git a/llvm/test/Transforms/LoopVectorize/reduction-align.ll b/llvm/test/Transforms/LoopVectorize/reduction-align.ll index 3216c9233ea21..028eb3b05957d 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-align.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-align.ll @@ -40,8 +40,8 @@ define void @fn(ptr %hbuf, ptr %ref, i32 %height) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[HEIGHT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[TMP4:%.*]] = phi i16 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/reverse_induction.ll b/llvm/test/Transforms/LoopVectorize/reverse_induction.ll index 5028dc5355c50..aff61aea3f9e9 100644 --- a/llvm/test/Transforms/LoopVectorize/reverse_induction.ll +++ b/llvm/test/Transforms/LoopVectorize/reverse_induction.ll @@ -196,9 +196,9 @@ define i32 @reverse_induction_i16(i16 %startval, ptr %ptr) { ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; 
CHECK-NEXT: br i1 true, label %[[LOOPEND:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[STARTVAL]], %[[VECTOR_SCEVCHECK]] ], [ [[STARTVAL]], %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[STARTVAL]], %[[ENTRY]] ], [ [[STARTVAL]], %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ 1024, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[ADD_I7:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD_I:%.*]], %[[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll index e14c547d96ad3..7b2af60fcfd23 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check-needed-but-empty.ll @@ -31,7 +31,7 @@ define void @test(ptr %A, i32 %x) { ; CHECK: middle.block: ; CHECK-NEXT: br i1 undef, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ undef, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ undef, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff 
--git a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll index a3bf5c76a20a6..bb515cd583e5b 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll @@ -47,7 +47,7 @@ define void @load_clamped_index(ptr %A, ptr %B, i32 %N) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -120,7 +120,7 @@ define void @store_clamped_index(ptr %A, ptr %B, i32 %N) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -201,7 +201,7 @@ define void @load_clamped_index_offset_1(ptr %A, ptr %B, i32 %N) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; 
CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_MEMCHECK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -384,7 +384,7 @@ define void @clamped_index_equal_dependence(ptr %A, ptr %B, i32 %N) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll index e0e80271c0d26..5c817ea313183 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll @@ -42,7 +42,7 @@ define i32 @foo(ptr nocapture %a, ptr nocapture %b, i32 %n) nounwind uwtable ssp ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]], !dbg [[DBG9]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]], !dbg [[DBG9]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], !dbg [[DBG9]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ], !dbg 
[[DBG9]] ; CHECK-NEXT: br label [[FOR_BODY:%.*]], !dbg [[DBG9]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], !dbg [[DBG9]] @@ -147,7 +147,7 @@ define void @test_runtime_check(ptr %a, float %b, i64 %offset, i64 %offset2, i64 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference-simplifications.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference-simplifications.ll index 038b482209592..d1324314eb953 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference-simplifications.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference-simplifications.ll @@ -115,7 +115,7 @@ define void @test_large_number_of_group(ptr %dst, i64 %off, i64 %N) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -259,7 +259,7 @@ define void @check_creation_order(ptr 
%a, ptr %b, i32 %m) { ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 31996, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 31996, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll index 1e05f201a1f18..a5501a0037b2c 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll @@ -83,7 +83,7 @@ define void @diff_checks(ptr nocapture noundef writeonly %dst, ptr nocapture nou ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_EXIT]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTER_LOOP]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[INNER_LOOP:%.*]] ; CHECK: inner.loop: ; CHECK-NEXT: [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ] @@ -205,7 +205,7 @@ define void @full_checks(ptr nocapture noundef %dst, ptr nocapture noundef reado ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_EXIT]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], 
[[MIDDLE_BLOCK]] ], [ 0, [[OUTER_LOOP]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[INNER_LOOP:%.*]] ; CHECK: inner.loop: ; CHECK-NEXT: [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ] @@ -337,7 +337,7 @@ define void @full_checks_diff_strides(ptr nocapture noundef %dst, ptr nocapture ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_EXIT]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTER_LOOP]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[INNER_LOOP:%.*]] ; CHECK: inner.loop: ; CHECK-NEXT: [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ] @@ -452,7 +452,7 @@ define void @diff_checks_src_start_invariant(ptr nocapture noundef writeonly %ds ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTER_LOOP]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[INNER_LOOP:%.*]] ; CHECK: inner.loop: ; CHECK-NEXT: [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ] @@ -566,7 +566,7 @@ define void @full_checks_src_start_invariant(ptr nocapture noundef %dst, ptr noc ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], 
[[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTER_LOOP]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[INNER_LOOP:%.*]] ; CHECK: inner.loop: ; CHECK-NEXT: [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ] @@ -711,7 +711,7 @@ define void @triple_nested_loop_mixed_access(ptr nocapture noundef %dst, ptr noc ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_END]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTER_LOOP]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[INNER_LOOP:%.*]] ; CHECK: inner.loop: ; CHECK-NEXT: [[INNER_IV:%.*]] = phi i64 [ [[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -869,7 +869,7 @@ define void @uncomputable_outer_tc(ptr nocapture noundef %dst, ptr nocapture nou ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTER_LOOP]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[INNER_LOOP:%.*]] ; CHECK: inner.loop: ; CHECK-NEXT: [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ] @@ -1038,7 +1038,7 @@ define void @decreasing_inner_iv(ptr nocapture noundef %dst, ptr nocapture nound ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP15]], [[N_VEC]] ; CHECK-NEXT: br 
i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[VECTOR_MEMCHECK]] ], [ [[TMP0]], [[OUTER_LOOP]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[OUTER_LOOP]] ], [ [[TMP0]], [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[INNER_LOOP:%.*]] ; CHECK: inner.loop: ; CHECK-NEXT: [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ] @@ -1200,7 +1200,7 @@ define void @decreasing_outer_iv(ptr nocapture noundef %dst, ptr nocapture nound ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTER_LOOP]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[INNER_LOOP:%.*]] ; CHECK: inner.loop: ; CHECK-NEXT: [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ] @@ -1355,7 +1355,7 @@ define void @unknown_inner_stride(ptr nocapture noundef %dst, ptr nocapture noun ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[OUTER_LOOP]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTER_LOOP]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[INNER_LOOP:%.*]] ; CHECK: inner.loop: ; CHECK-NEXT: [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], 
[[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ] @@ -1470,7 +1470,7 @@ define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr noca ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_EXIT]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTER_LOOP]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[INNER_LOOP:%.*]] ; CHECK: inner.loop: ; CHECK-NEXT: [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ] @@ -1558,7 +1558,7 @@ define void @stride_check_known_via_loop_guard(ptr %C, ptr %A, i32 %Acols) { ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[OUTER_LATCH]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_HEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[OUTER_HEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[INNER:%.*]] ; CHECK: inner: ; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER]] ] diff --git a/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll b/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll index 63bf01fe604e8..a848f98e0949e 100644 --- a/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll +++ b/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll @@ -50,24 +50,24 @@ define void @test_pr63368(i1 %c, ptr %A) { ; CHECK: vector.ph4: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]] -; CHECK-NEXT: [[IND_END:%.*]] = trunc i32 [[N_VEC]] to i8 +; CHECK-NEXT: 
[[TMP9:%.*]] = trunc i32 [[N_VEC]] to i8 ; CHECK-NEXT: br label [[VECTOR_BODY5:%.*]] ; CHECK: vector.body5: -; CHECK-NEXT: [[INDEX8:%.*]] = phi i32 [ 0, [[VECTOR_PH4]] ], [ [[INDEX_NEXT9:%.*]], [[VECTOR_BODY5]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX8]] to i8 -; CHECK-NEXT: [[TMP14:%.*]] = add i8 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = add i8 [[TMP14]], 1 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i8 [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 -; CHECK-NEXT: store <4 x i8> zeroinitializer, ptr [[TMP17]], align 1 -; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i32 [[INDEX8]], 4 -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT9]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK2:%.*]], label [[VECTOR_BODY5]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[INDEX6:%.*]] = phi i32 [ 0, [[VECTOR_PH4]] ], [ [[INDEX_NEXT7:%.*]], [[VECTOR_BODY5]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX6]] to i8 +; CHECK-NEXT: [[TMP10:%.*]] = add i8 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = add i8 [[TMP10]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[A]], i8 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i32 0 +; CHECK-NEXT: store <4 x i8> zeroinitializer, ptr [[TMP13]], align 1 +; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i32 [[INDEX6]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT7]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK2:%.*]], label [[VECTOR_BODY5]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block2: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_2:%.*]], label [[SCALAR_PH3]] ; CHECK: scalar.ph3: -; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK2]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[EXIT_1]] ] +; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i8 [ [[TMP9]], [[MIDDLE_BLOCK2]] ], [ 0, [[EXIT_1]] ], [ 0, 
[[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP_2:%.*]] ; CHECK: loop.2: ; CHECK-NEXT: [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL8]], [[SCALAR_PH3]] ], [ [[IV_2_NEXT:%.*]], [[LOOP_2]] ] diff --git a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll index 20053cd8661d1..122469b8c95e6 100644 --- a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll +++ b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll @@ -48,7 +48,7 @@ define void @step_direction_unknown(i32 %arg, ptr %dst) { ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[LOOP]] ] @@ -204,8 +204,8 @@ define void @implied_wrap_predicate(ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_MEMCHECK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END5]], [[MIDDLE_BLOCK]] ], [ 1, [[VECTOR_MEMCHECK]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END5]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label 
[[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll index 60f85e7d5936a..576a971c5eaa8 100644 --- a/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll +++ b/llvm/test/Transforms/LoopVectorize/select-cmp-multiuse.ll @@ -549,9 +549,9 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) { ; CHECK-VF4-IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-VF4-IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-VF4-IC1: scalar.ph: -; CHECK-VF4-IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-VF4-IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ true, [[VECTOR_MEMCHECK]] ], [ true, [[ENTRY]] ] -; CHECK-VF4-IC1-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i1 [ [[RDX_SELECT9]], [[MIDDLE_BLOCK]] ], [ false, [[VECTOR_MEMCHECK]] ], [ false, [[ENTRY]] ] +; CHECK-VF4-IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-VF4-IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ true, [[ENTRY]] ], [ true, [[VECTOR_MEMCHECK]] ] +; CHECK-VF4-IC1-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i1 [ [[RDX_SELECT9]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY]] ], [ false, [[VECTOR_MEMCHECK]] ] ; CHECK-VF4-IC1-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-VF4-IC1: for.body: ; CHECK-VF4-IC1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END6:%.*]] ] @@ -711,9 +711,9 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) { ; CHECK-VF4-IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-VF4-IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], 
label [[SCALAR_PH]] ; CHECK-VF4-IC2: scalar.ph: -; CHECK-VF4-IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-VF4-IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ true, [[VECTOR_MEMCHECK]] ], [ true, [[ENTRY]] ] -; CHECK-VF4-IC2-NEXT: [[BC_MERGE_RDX22:%.*]] = phi i1 [ [[RDX_SELECT21]], [[MIDDLE_BLOCK]] ], [ false, [[VECTOR_MEMCHECK]] ], [ false, [[ENTRY]] ] +; CHECK-VF4-IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-VF4-IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ true, [[ENTRY]] ], [ true, [[VECTOR_MEMCHECK]] ] +; CHECK-VF4-IC2-NEXT: [[BC_MERGE_RDX22:%.*]] = phi i1 [ [[RDX_SELECT21]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY]] ], [ false, [[VECTOR_MEMCHECK]] ] ; CHECK-VF4-IC2-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-VF4-IC2: for.body: ; CHECK-VF4-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END6:%.*]] ] @@ -808,9 +808,9 @@ define i32 @multi_user_cmp_branch_use(ptr readonly %a, ptr %b, i64 noundef %n) { ; CHECK-VF1-IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-VF1-IC2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-VF1-IC2: scalar.ph: -; CHECK-VF1-IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-VF1-IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ true, [[VECTOR_MEMCHECK]] ], [ true, [[ENTRY]] ] -; CHECK-VF1-IC2-NEXT: [[BC_MERGE_RDX9:%.*]] = phi i1 [ [[RDX_SELECT8]], [[MIDDLE_BLOCK]] ], [ false, [[VECTOR_MEMCHECK]] ], [ false, [[ENTRY]] ] +; CHECK-VF1-IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-VF1-IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi 
i1 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ true, [[ENTRY]] ], [ true, [[VECTOR_MEMCHECK]] ] +; CHECK-VF1-IC2-NEXT: [[BC_MERGE_RDX9:%.*]] = phi i1 [ [[RDX_SELECT8]], [[MIDDLE_BLOCK]] ], [ false, [[ENTRY]] ], [ false, [[VECTOR_MEMCHECK]] ] ; CHECK-VF1-IC2-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-VF1-IC2: for.body: ; CHECK-VF1-IC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END6:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll index 4bcf8e0180d63..b414f53291df4 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll @@ -217,8 +217,8 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll index 75472d29d6ed0..d9827a8c71ee6 100644 --- a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll @@ -59,7 +59,7 @@ define i16 @test(ptr 
%arg, i64 %N) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[LOOP_3_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_3_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP_3:%.*]] ; CHECK: inner.latch: ; CHECK-NEXT: [[C_4:%.*]] = call i1 @cond() diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll index 791c995d88c14..16e071dec9604 100644 --- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll +++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll @@ -40,8 +40,8 @@ define void @test_versioned_with_sext_use(i32 %offset, ptr %dst) { ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[OUTER_HEADER_LOOPEXIT]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IV_1]], [[VECTOR_SCEVCHECK]] ], [ [[IV_1]], [[INNER_LOOP_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[INNER_LOOP_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IV_1]], [[INNER_LOOP_PREHEADER]] ], [ [[IV_1]], [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[INNER_LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[INNER_LOOP]] ; CHECK: inner.loop: ; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[IV_2_NEXT]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -115,8 +115,8 @@ define void @test_versioned_with_zext_use(i32 %offset, ptr %dst) { ; CHECK: middle.block: ; CHECK-NEXT: br 
i1 false, label [[OUTER_HEADER_LOOPEXIT]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IV_1]], [[VECTOR_SCEVCHECK]] ], [ [[IV_1]], [[INNER_LOOP_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[INNER_LOOP_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IV_1]], [[INNER_LOOP_PREHEADER]] ], [ [[IV_1]], [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[INNER_LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[INNER_LOOP]] ; CHECK: inner.loop: ; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[IV_2_NEXT]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -188,7 +188,7 @@ define void @versioned_sext_use_in_gep(i32 %scale, ptr %dst, i64 %scale.2) { ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -274,8 +274,8 @@ define void @test_versioned_with_different_uses(i32 %offset, ptr noalias %dst.1, ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[OUTER_HEADER_LOOPEXIT]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IV_1]], [[VECTOR_SCEVCHECK]] ], [ [[IV_1]], [[INNER_LOOP_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[INNER_LOOP_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] 
= phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IV_1]], [[INNER_LOOP_PREHEADER]] ], [ [[IV_1]], [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[INNER_LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[INNER_LOOP]] ; CHECK: inner.loop: ; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[IV_2_NEXT]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -371,8 +371,8 @@ define void @test_versioned_with_non_ex_use(i32 %offset, ptr noalias %dst.1, ptr ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ] @@ -448,7 +448,7 @@ define void @zext_of_i1_stride(i1 %g, ptr %dst) mustprogress { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -510,7 +510,7 @@ define void @sext_of_i1_stride(i1 %g, ptr 
%dst) mustprogress { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll index a816c1a7cedee..b056f44a6c469 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll @@ -76,7 +76,7 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER13]] ; CHECK: for.body.preheader13: -; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[TMP0]], [[VECTOR_MEMCHECK]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ [[TMP0]], [[VECTOR_MEMCHECK]] ], [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER13]] ] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll index 886e7a758d053..089511d5b2e57 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll +++ 
b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll @@ -147,7 +147,7 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[CONV6]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[VECTOR_MEMCHECK_1:%.*]], label [[FOR_BODY4_US_PREHEADER]] ; CHECK: for.body4.us.preheader: -; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY4_US:%.*]] ; CHECK: for.body4.us: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY4_US]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY4_US_PREHEADER]] ] @@ -223,7 +223,7 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: [[CMP_N_1:%.*]] = icmp eq i64 [[N_VEC_1]], [[CONV6]] ; CHECK-NEXT: br i1 [[CMP_N_1]], label [[VECTOR_MEMCHECK_2:%.*]], label [[FOR_BODY4_US_PREHEADER_1]] ; CHECK: for.body4.us.preheader.1: -; CHECK-NEXT: [[INDVARS_IV_PH_1:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK_1]] ], [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[N_VEC_1]], [[MIDDLE_BLOCK_1]] ] +; CHECK-NEXT: [[INDVARS_IV_PH_1:%.*]] = phi i64 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ 0, [[VECTOR_MEMCHECK_1]] ], [ [[N_VEC_1]], [[MIDDLE_BLOCK_1]] ] ; CHECK-NEXT: br label [[FOR_BODY4_US_1:%.*]] ; CHECK: for.body4.us.1: ; CHECK-NEXT: [[INDVARS_IV_1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_BODY4_US_1]] ], [ [[INDVARS_IV_PH_1]], [[FOR_BODY4_US_PREHEADER_1]] ] @@ -300,7 +300,7 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: [[CMP_N_2:%.*]] = icmp eq i64 [[N_VEC_2]], [[CONV6]] ; CHECK-NEXT: br i1 [[CMP_N_2]], label [[VECTOR_MEMCHECK_3:%.*]], label 
[[FOR_BODY4_US_PREHEADER_2]] ; CHECK: for.body4.us.preheader.2: -; CHECK-NEXT: [[INDVARS_IV_PH_2:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK_2]] ], [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]] ], [ [[N_VEC_2]], [[MIDDLE_BLOCK_2]] ] +; CHECK-NEXT: [[INDVARS_IV_PH_2:%.*]] = phi i64 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]] ], [ 0, [[VECTOR_MEMCHECK_2]] ], [ [[N_VEC_2]], [[MIDDLE_BLOCK_2]] ] ; CHECK-NEXT: br label [[FOR_BODY4_US_2:%.*]] ; CHECK: for.body4.us.2: ; CHECK-NEXT: [[INDVARS_IV_2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_2:%.*]], [[FOR_BODY4_US_2]] ], [ [[INDVARS_IV_PH_2]], [[FOR_BODY4_US_PREHEADER_2]] ] @@ -377,7 +377,7 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: [[CMP_N_3:%.*]] = icmp eq i64 [[N_VEC_3]], [[CONV6]] ; CHECK-NEXT: br i1 [[CMP_N_3]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY4_US_PREHEADER_3]] ; CHECK: for.body4.us.preheader.3: -; CHECK-NEXT: [[INDVARS_IV_PH_3:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK_3]] ], [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]] ], [ [[N_VEC_3]], [[MIDDLE_BLOCK_3]] ] +; CHECK-NEXT: [[INDVARS_IV_PH_3:%.*]] = phi i64 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]] ], [ 0, [[VECTOR_MEMCHECK_3]] ], [ [[N_VEC_3]], [[MIDDLE_BLOCK_3]] ] ; CHECK-NEXT: br label [[FOR_BODY4_US_3:%.*]] ; CHECK: for.body4.us.3: ; CHECK-NEXT: [[INDVARS_IV_3:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_BODY4_US_3]] ], [ [[INDVARS_IV_PH_3]], [[FOR_BODY4_US_PREHEADER_3]] ] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll index 9196e3c96462a..2fe420183c683 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll @@ -109,7 +109,7 @@ define dso_local void @test(ptr %start, ptr %end) #0 { ; AVX2-NEXT: [[CMP_N17:%.*]] = icmp eq i64 [[TMP3]], 
[[N_VEC10]] ; AVX2-NEXT: br i1 [[CMP_N17]], label [[EXIT]], label [[BB12_PREHEADER1]] ; AVX2: bb12.preheader: -; AVX2-NEXT: [[PTR2_PH:%.*]] = phi ptr [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[BB12_PREHEADER]] ], [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; AVX2-NEXT: [[PTR2_PH:%.*]] = phi ptr [ [[START]], [[BB12_PREHEADER]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; AVX2-NEXT: br label [[BB13:%.*]] ; AVX2: bb12: ; AVX2-NEXT: [[PTR2:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LATCH:%.*]] ], [ [[PTR2_PH]], [[BB12_PREHEADER1]] ] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll index 703a53949a063..7817c23e6a3ec 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll @@ -90,7 +90,7 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: [[CMP_N17:%.*]] = icmp eq i64 [[N_VEC11]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[CMP_N17]], label [[FOR_END]], label [[FOR_BODY_PREHEADER9]] ; CHECK: for.body.preheader: -; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[N_VEC11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[TMP43:%.*]] = sub nsw i64 [[WIDE_TRIP_COUNT]], [[INDVARS_IV_PH]] ; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP43]], 7 ; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 From 09a500b3db5e99db4b5c7d5ac95c3aa99c191adf Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Sun, 9 Feb 2025 13:57:41 +0100 Subject: [PATCH 088/293] [ValueTracking] more test of trunc to i1 as condition in dominating condition. 
(NFC) --- .../test/Transforms/InstCombine/known-bits.ll | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll index a3872fefecf3b..5b36684d7149b 100644 --- a/llvm/test/Transforms/InstCombine/known-bits.ll +++ b/llvm/test/Transforms/InstCombine/known-bits.ll @@ -2313,6 +2313,58 @@ if.else: ret i8 %c } +define i8 @test_trunc_cond_and(i8 %x, i1 %c) { +; CHECK-LABEL: @test_trunc_cond_and( +; CHECK-NEXT: [[CMP:%.*]] = trunc i8 [[X:%.*]] to i1 +; CHECK-NEXT: [[COND:%.*]] = and i1 [[C:%.*]], [[CMP]] +; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]] +; CHECK: if: +; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -2 +; CHECK-NEXT: ret i8 [[OR1]] +; CHECK: exit: +; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -2 +; CHECK-NEXT: ret i8 [[OR2]] +; + %cmp = trunc i8 %x to i1 + %cond = and i1 %cmp, %c + br i1 %cond, label %if, label %exit + +if: + %or1 = or i8 %x, -2 + ret i8 %or1 + +exit: + %or2 = or i8 %x, -2 + ret i8 %or2 +} + +define i8 @test_not_trunc_cond_and(i8 %x, i1 %c) { +; CHECK-LABEL: @test_not_trunc_cond_and( +; CHECK-NEXT: [[CMP:%.*]] = trunc i8 [[X:%.*]] to i1 +; CHECK-NEXT: [[NOT:%.*]] = xor i1 [[CMP]], true +; CHECK-NEXT: [[COND:%.*]] = and i1 [[C:%.*]], [[NOT]] +; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]] +; CHECK: if: +; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -2 +; CHECK-NEXT: ret i8 [[OR1]] +; CHECK: exit: +; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -2 +; CHECK-NEXT: ret i8 [[OR2]] +; + %cmp = trunc i8 %x to i1 + %not = xor i1 %cmp, true + %cond = and i1 %not, %c + br i1 %cond, label %if, label %exit + +if: + %or1 = or i8 %x, -2 + ret i8 %or1 + +exit: + %or2 = or i8 %x, -2 + ret i8 %or2 +} + declare void @dummy() declare void @use(i1) declare void @sink(i8) From 4c470d0c933cee57843052e0783f6f11a20bd820 Mon Sep 17 00:00:00 2001 From: Ritanya-B-Bharadwaj Date: Sun, 9 Feb 2025 18:33:03 +0530 Subject: [PATCH 089/293] 
=?UTF-8?q?This=20commit=20fixes=20the=20build=20f?= =?UTF-8?q?ailure=20due=20to=20OMP=5FTRAIT=5FPROPERTY=20macro=20r=E2=80=A6?= =?UTF-8?q?=20(#126222)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …edefinition - https://github.com/llvm/llvm-project/issues/126043 --- llvm/lib/Frontend/OpenMP/OMPContext.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Frontend/OpenMP/OMPContext.cpp b/llvm/lib/Frontend/OpenMP/OMPContext.cpp index cca5f8806f9e5..5e13da172d677 100644 --- a/llvm/lib/Frontend/OpenMP/OMPContext.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPContext.cpp @@ -67,7 +67,7 @@ OMPContext::OMPContext(bool IsDeviceCompilation, Triple TargetTriple, TargetOffloadTriple.getArch() == Triple::x86_64) \ ActiveTraits.set(unsigned(TraitProperty::Enum)); \ } -#undef OMP_TRAIT_PROPERTY +#include "llvm/Frontend/OpenMP/OMPKinds.def" } else { // Add the appropriate device kind trait based on the triple and the // IsDeviceCompilation flag. From a32efd8edc6ec5f80ffa16b3d4e52e6407d5fe99 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Sun, 9 Feb 2025 07:32:50 -0600 Subject: [PATCH 090/293] [Clang] Disable failing offload test on darwin Summary: We don't support offloading on Darwin. This fails because there's some handling missing somewhere else that likely won't ever be added. 
--- clang/test/Driver/offload-Xarch.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/test/Driver/offload-Xarch.c b/clang/test/Driver/offload-Xarch.c index 8856dac198465..0f8f40a5cbd74 100644 --- a/clang/test/Driver/offload-Xarch.c +++ b/clang/test/Driver/offload-Xarch.c @@ -1,3 +1,5 @@ +// UNSUPPORTED: target={{.*darwin.*}} + // RUN: %clang --target=x86_64-unknown-linux-gnu -x cuda %s -Xarch_nvptx64 -O3 -S -nogpulib -nogpuinc -### 2>&1 | FileCheck -check-prefix=O3ONCE %s // RUN: %clang -x cuda %s -Xarch_device -O3 -S -nogpulib -nogpuinc -### 2>&1 | FileCheck -check-prefix=O3ONCE %s // RUN: %clang -x hip %s -Xarch_amdgcn -O3 -S -nogpulib -nogpuinc -### 2>&1 | FileCheck -check-prefix=O3ONCE %s From b1a267e1b9e9b50ba5b99de014ed056bf201b762 Mon Sep 17 00:00:00 2001 From: Andrzej Warzynski Date: Sun, 9 Feb 2025 13:54:11 +0000 Subject: [PATCH 091/293] [mlir][vector] Remove references to non-existing patterns (nfc) Delete references to: * `VectorLoadToMemrefLoadLowering`, * `VectorStoreToMemrefStoreLowering`. These patters were removed in #121454. --- .../mlir/Dialect/Vector/Transforms/LoweringPatterns.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/LoweringPatterns.h b/mlir/include/mlir/Dialect/Vector/Transforms/LoweringPatterns.h index c507b23c6d4de..6aeae30a0a6c0 100644 --- a/mlir/include/mlir/Dialect/Vector/Transforms/LoweringPatterns.h +++ b/mlir/include/mlir/Dialect/Vector/Transforms/LoweringPatterns.h @@ -157,12 +157,6 @@ void populateVectorTransposeLoweringPatterns(RewritePatternSet &patterns, /// Progressive lowering of transfer_write. This pattern supports lowering of /// `vector.transfer_write` to `vector.store` /// -/// [VectorLoadToMemrefLoadLowering] -/// Replace a 0-d vector.load with a memref.load + vector.broadcast. -/// -/// [VectorStoreToMemrefStoreLowering] -/// Replace a 0-d vector.store with a vector.extractelement + memref.store. 
-/// /// These patterns lower transfer ops to simpler ops like `vector.load`, /// `vector.store` and `vector.broadcast`. Only transfers with a transfer rank /// of a most `maxTransferRank` are lowered. This is useful when combined with From 8a4707bf1de659f569558ab32d4c7cf5029acd3f Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sun, 9 Feb 2025 15:04:25 +0100 Subject: [PATCH 092/293] [NFC][libc++] Fixes minor issues in the synopsis. --- libcxx/include/chrono | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libcxx/include/chrono b/libcxx/include/chrono index bd4c98600440c..b39f060bf6b08 100644 --- a/libcxx/include/chrono +++ b/libcxx/include/chrono @@ -922,6 +922,8 @@ strong_ordering operator<=>(const time_zone_link& x, const time_zone_link& y); } // chrono namespace std { + template + struct formatter, charT>; // C++20 template struct formatter, charT>; // C++20 template @@ -929,11 +931,9 @@ namespace std { template struct formatter, charT>; // C++20 template - struct formatter, charT>; // C++20 + struct formatter, charT>; // C++20 template struct formatter, charT>; // C++20 - template - struct formatter, charT>; // C++20 template struct formatter; // C++20 template struct formatter; // C++20 template struct formatter; // C++20 From cb1b51f4ff4e2a179dddf492e3310343f53a9ba1 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sun, 9 Feb 2025 15:11:13 +0100 Subject: [PATCH 093/293] [libc++][doc] Updates format status. 
--- libcxx/docs/Status/FormatPaper.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/docs/Status/FormatPaper.csv b/libcxx/docs/Status/FormatPaper.csv index beec97b8c0179..6ddae9e2a1518 100644 --- a/libcxx/docs/Status/FormatPaper.csv +++ b/libcxx/docs/Status/FormatPaper.csv @@ -2,7 +2,7 @@ Section,Description,Dependencies,Assignee,Status,First released version `P1361 `__ `P2372 `__,"Formatting chrono" `[time.syn] `_,"Formatter ``chrono::duration``",,Mark de Wever,|Complete|,16 `[time.syn] `_,"Formatter ``chrono::sys_time``",,Mark de Wever,|Complete|,17 -`[time.syn] `_,"Formatter ``chrono::utc_time``",A ```` implementation,Mark de Wever,|Complete|,20 +`[time.syn] `_,"Formatter ``chrono::utc_time``",,Mark de Wever,|Complete|,20 `[time.syn] `_,"Formatter ``chrono::tai_time``",,Mark de Wever,|Complete|,21 `[time.syn] `_,"Formatter ``chrono::gps_time``",A ```` implementation,Mark de Wever,,, `[time.syn] `_,"Formatter ``chrono::file_time``",,Mark de Wever,|Complete|,17 From 70906f0514826b5e64bd9354210ae836740c2053 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 9 Feb 2025 15:02:41 +0000 Subject: [PATCH 094/293] [LV][X86] Regenerate interleaved load/store costs. NFC. update_analyze_test_checks has improved the checks since these were last updated. Reduce noise diffs in future patches. 
--- .../interleaved-load-f32-stride-2.ll | 44 +++++++------- .../interleaved-load-f32-stride-3.ll | 44 +++++++------- .../interleaved-load-f32-stride-4.ll | 43 +++++++------- .../interleaved-load-f32-stride-6.ll | 5 ++ .../interleaved-load-i16-stride-2.ll | 57 ++++++++++--------- .../interleaved-load-i16-stride-3.ll | 57 ++++++++++--------- .../interleaved-load-i16-stride-4.ll | 57 ++++++++++--------- .../interleaved-load-i16-stride-6.ll | 57 ++++++++++--------- ...nterleaved-load-i32-stride-2-indices-0u.ll | 44 +++++++------- .../interleaved-load-i32-stride-2.ll | 44 +++++++------- ...terleaved-load-i32-stride-3-indices-01u.ll | 44 +++++++------- ...terleaved-load-i32-stride-3-indices-0uu.ll | 44 +++++++------- .../interleaved-load-i32-stride-3.ll | 44 +++++++------- ...erleaved-load-i32-stride-4-indices-012u.ll | 43 +++++++------- ...erleaved-load-i32-stride-4-indices-0uuu.ll | 44 +++++++------- .../interleaved-load-i32-stride-4.ll | 43 +++++++------- .../interleaved-load-i32-stride-6.ll | 37 ++++++------ .../CostModel/interleaved-load-i8-stride-2.ll | 57 ++++++++++--------- .../CostModel/interleaved-load-i8-stride-3.ll | 57 ++++++++++--------- .../CostModel/interleaved-load-i8-stride-4.ll | 57 ++++++++++--------- .../interleaved-store-f32-stride-2.ll | 44 +++++++------- .../interleaved-store-f32-stride-3.ll | 44 +++++++------- .../interleaved-store-f32-stride-4.ll | 44 +++++++------- .../interleaved-store-f32-stride-5.ll | 38 +++++++------ .../interleaved-store-f32-stride-6.ll | 38 +++++++------ .../interleaved-store-f32-stride-7.ll | 36 ++++++------ .../interleaved-store-f64-stride-2.ll | 44 +++++++------- .../interleaved-store-f64-stride-3.ll | 38 +++++++------ .../interleaved-store-f64-stride-4.ll | 36 ++++++------ .../interleaved-store-f64-stride-5.ll | 30 +++++----- .../interleaved-store-f64-stride-6.ll | 30 +++++----- .../interleaved-store-f64-stride-7.ll | 30 +++++----- .../interleaved-store-i16-stride-2.ll | 57 ++++++++++--------- 
.../interleaved-store-i16-stride-3.ll | 57 ++++++++++--------- .../interleaved-store-i16-stride-4.ll | 57 ++++++++++--------- .../interleaved-store-i16-stride-5.ll | 57 ++++++++++--------- .../interleaved-store-i16-stride-6.ll | 57 ++++++++++--------- .../interleaved-store-i16-stride-7.ll | 57 ++++++++++--------- .../interleaved-store-i16-stride-8.ll | 5 ++ .../interleaved-store-i32-stride-2.ll | 44 +++++++------- .../interleaved-store-i32-stride-3.ll | 44 +++++++------- .../interleaved-store-i32-stride-4.ll | 44 +++++++------- .../interleaved-store-i32-stride-5.ll | 38 +++++++------ .../interleaved-store-i32-stride-6.ll | 38 +++++++------ .../interleaved-store-i32-stride-7.ll | 36 ++++++------ .../interleaved-store-i32-stride-8.ll | 4 ++ .../interleaved-store-i64-stride-2.ll | 44 +++++++------- .../interleaved-store-i64-stride-3.ll | 38 +++++++------ .../interleaved-store-i64-stride-4.ll | 36 ++++++------ .../interleaved-store-i64-stride-5.ll | 30 +++++----- .../interleaved-store-i64-stride-6.ll | 30 +++++----- .../interleaved-store-i64-stride-7.ll | 30 +++++----- .../interleaved-store-i8-stride-2.ll | 57 ++++++++++--------- .../interleaved-store-i8-stride-3.ll | 57 ++++++++++--------- .../interleaved-store-i8-stride-4.ll | 57 ++++++++++--------- .../interleaved-store-i8-stride-5.ll | 57 ++++++++++--------- .../interleaved-store-i8-stride-6.ll | 57 ++++++++++--------- .../interleaved-store-i8-stride-7.ll | 57 ++++++++++--------- .../interleaved-store-i8-stride-8.ll | 5 ++ 59 files changed, 1393 insertions(+), 1131 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-2.ll index a3aab8274391a..b14e2a2a597da 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-2.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-2.ll @@ -14,35 +14,39 @@ target triple = 
"x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 -; SSE2: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; SSE2: Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; SSE2: Cost of 28 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; SSE2: Cost of 56 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 28 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX1: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 30 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 60 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 120 for VF 32: INTERLEAVE-GROUP with factor 2 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 30 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 60 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 
120 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 12 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 24 for VF 32: INTERLEAVE-GROUP with factor 2 at %v0 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 24 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX512: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512: Cost of 5 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512: Cost of 22 for VF 32: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512: Cost of 92 for VF 64: INTERLEAVE-GROUP with factor 2 at %v0 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = 
load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 3 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 5 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 22 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 92 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll index ed51006a89543..4986d48b5eff9 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll @@ -14,35 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 -; SSE2: Cost of 9 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; SSE2: Cost of 21 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; SSE2: Cost of 42 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; SSE2: Cost of 84 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 42 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 84 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, 
ptr %in0, align 4 -; AVX1: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 21 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 45 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 90 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 180 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 45 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 90 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 180 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 5 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 10 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 20 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 44 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: 
Found an estimated cost of 44 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX512: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost of 12 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost of 51 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost of 210 for VF 64: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 51 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 210 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-4.ll index efa8cd7357684..6a3e2471f393a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-4.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-4.ll @@ -14,34 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = 
load float, ptr %in0, align 4 -; SSE2: Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; SSE2: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; SSE2: Cost of 56 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; SSE2: Cost of 112 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX1: Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 60 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 120 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 240 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 28 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 60 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 240 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 
for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 10 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 20 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 40 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 84 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 84 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX512: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512: Cost of 5 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512: Cost of 8 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512: Cost of 22 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512: Cost of 92 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 8 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 22 for VF 16 For 
instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 92 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-6.ll index 46ff5001d6572..600381d8f8c02 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-6.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-6.ll @@ -14,12 +14,14 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 ; SSE2: LV: Found an estimated cost of 18 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 ; SSE2: LV: Found an estimated cost of 42 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 ; SSE2: LV: Found an estimated cost of 84 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX1: LV: Found an estimated cost of 18 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX1: LV: Found an estimated cost of 42 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX1: LV: Found an estimated cost of 90 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 @@ -27,6 +29,7 @@ define void @test() { ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load 
float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX2: LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX2: LV: Found an estimated cost of 37 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 @@ -34,11 +37,13 @@ define void @test() { ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX512: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX512: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX512: LV: Found an estimated cost of 21 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX512: LV: Found an estimated cost of 210 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-2.ll index 56ed11d68b117..39f9bec780425 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-2.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-2.ll @@ -15,44 +15,49 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; SSE2: 
Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; SSE2: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; SSE2: Cost of 34 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; SSE2: Cost of 68 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 34 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 68 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX1: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 34 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 70 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 140 for VF 32: INTERLEAVE-GROUP with factor 2 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 34 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 70 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 140 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: Cost of 
3 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 7 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 11 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 22 for VF 32: INTERLEAVE-GROUP with factor 2 at %v0 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 7 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 11 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 22 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX512DQ: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512DQ: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512DQ: Cost of 7 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512DQ: Cost of 10 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512DQ: Cost of 20 for VF 32: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512DQ: Cost of 284 for VF 64: INTERLEAVE-GROUP with factor 2 at %v0 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 7 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 10 for VF 16 For 
instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 20 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 284 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX512BW: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512BW: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512BW: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512BW: Cost of 5 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512BW: Cost of 7 for VF 32: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512BW: Cost of 34 for VF 64: INTERLEAVE-GROUP with factor 2 at %v0 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 5 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 5 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 7 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 34 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll index f996f35240747..7371ee8b62df1 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll @@ 
-15,44 +15,49 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; SSE2: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; SSE2: Cost of 26 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; SSE2: Cost of 51 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; SSE2: Cost of 102 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 18 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 26 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 51 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 102 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX1: Cost of 15 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 51 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 105 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 210 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 28 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 51 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 105 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an 
estimated cost of 210 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 10 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 11 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 31 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 62 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 11 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 31 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 62 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX512DQ: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512DQ: Cost of 10 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512DQ: Cost of 12 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512DQ: Cost of 30 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512DQ: Cost of 59 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512DQ: Cost of 426 for VF 64: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 10 for VF 
4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 30 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 59 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 426 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX512BW: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512BW: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512BW: Cost of 7 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512BW: Cost of 9 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512BW: Cost of 18 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512BW: Cost of 81 for VF 64: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 7 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 9 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 18 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 81 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-4.ll 
b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-4.ll index 7ee418e7b12ad..85254340fecff 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-4.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-4.ll @@ -15,44 +15,49 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; SSE2: Cost of 17 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; SSE2: Cost of 34 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; SSE2: Cost of 68 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; SSE2: Cost of 136 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 17 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 34 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 68 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 136 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX1: Cost of 17 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 34 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 68 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 140 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 280 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an 
estimated cost of 34 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 68 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 35 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 79 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 158 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 79 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 158 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX512DQ: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512DQ: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512DQ: Cost of 34 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512DQ: Cost of 77 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512DQ: Cost of 154 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512DQ: Cost of 568 
for VF 64: INTERLEAVE-GROUP with factor 4 at %v0 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 34 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 77 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 154 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 568 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX512BW: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512BW: Cost of 9 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512BW: Cost of 9 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512BW: Cost of 12 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512BW: Cost of 34 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512BW: Cost of 148 for VF 64: INTERLEAVE-GROUP with factor 4 at %v0 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 9 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 34 for VF 32 For instruction: %v0 = load i16, ptr %in0, 
align 2 +; AVX512BW: LV: Found an estimated cost of 148 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-6.ll index 00367457ae3eb..b46969f1c5ea8 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-6.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-6.ll @@ -15,44 +15,49 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; SSE2: Cost of 26 for VF 2: INTERLEAVE-GROUP with factor 6 at %v0 -; SSE2: Cost of 51 for VF 4: INTERLEAVE-GROUP with factor 6 at %v0 -; SSE2: Cost of 102 for VF 8: INTERLEAVE-GROUP with factor 6 at %v0 -; SSE2: Cost of 204 for VF 16: INTERLEAVE-GROUP with factor 6 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 26 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 51 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 102 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 204 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX1: Cost of 28 for VF 2: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX1: Cost of 51 for VF 4: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX1: Cost of 102 for VF 8: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX1: Cost of 210 for VF 16: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX1: Cost of 420 for VF 32: INTERLEAVE-GROUP with 
factor 6 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 51 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 102 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 210 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 420 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX2: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX2: Cost of 42 for VF 8: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX2: Cost of 112 for VF 16: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX2: Cost of 224 for VF 32: INTERLEAVE-GROUP with factor 6 at %v0 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 42 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 112 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 224 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX512DQ: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX512DQ: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 6 at 
%v0 -; AVX512DQ: Cost of 41 for VF 8: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX512DQ: Cost of 109 for VF 16: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX512DQ: Cost of 218 for VF 32: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX512DQ: Cost of 852 for VF 64: INTERLEAVE-GROUP with factor 6 at %v0 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 16 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 41 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 109 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 218 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 852 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX512BW: Cost of 13 for VF 2: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX512BW: Cost of 13 for VF 4: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX512BW: Cost of 17 for VF 8: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX512BW: Cost of 33 for VF 16: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX512BW: Cost of 81 for VF 32: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX512BW: Cost of 342 for VF 64: INTERLEAVE-GROUP with factor 6 at %v0 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 13 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 13 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 17 for VF 8 For 
instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 33 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 81 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 342 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-2-indices-0u.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-2-indices-0u.ll index 312ed6d8536ef..3704264f4e221 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-2-indices-0u.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-2-indices-0u.ll @@ -14,35 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; SSE2: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; SSE2: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; SSE2: Cost of 30 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; SSE2: Cost of 60 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 30 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 60 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX1: Cost of 2 for VF 2: 
INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 21 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 42 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 84 for VF 32: INTERLEAVE-GROUP with factor 2 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 2 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 21 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 42 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 84 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 4 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 8 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 16 for VF 32: INTERLEAVE-GROUP with factor 2 at %v0 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 2 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 16 for VF 32 For instruction: %v0 = load i32, ptr %in0, 
align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX512: Cost of 1 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512: Cost of 1 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512: Cost of 1 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512: Cost of 2 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512: Cost of 13 for VF 32: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512: Cost of 50 for VF 64: INTERLEAVE-GROUP with factor 2 at %v0 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 2 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 13 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 50 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-2.ll index 94d1e685ee670..196f4aae3a485 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-2.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-2.ll @@ -14,35 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; SSE2: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; SSE2: 
Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; SSE2: Cost of 60 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; SSE2: Cost of 120 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 60 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX1: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 38 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 76 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 152 for VF 32: INTERLEAVE-GROUP with factor 2 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 38 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 76 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 152 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost 
of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 12 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 24 for VF 32: INTERLEAVE-GROUP with factor 2 at %v0 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 24 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX512: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512: Cost of 5 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512: Cost of 22 for VF 32: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512: Cost of 92 for VF 64: INTERLEAVE-GROUP with factor 2 at %v0 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 3 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 5 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 22 for VF 32 
For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 92 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-01u.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-01u.ll index 063bf0819f0a9..679d74c4e6ed3 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-01u.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-01u.ll @@ -14,35 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; SSE2: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; SSE2: Cost of 31 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; SSE2: Cost of 62 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; SSE2: Cost of 124 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 31 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 62 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 124 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX1: Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 19 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 40 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 80 for VF 16: 
INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 160 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 19 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 40 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 80 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 160 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 8 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 16 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 34 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 16 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX512: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 3 at 
%v0 -; AVX512: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost of 9 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost of 36 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost of 144 for VF 64: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 5 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 9 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 36 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 144 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-0uu.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-0uu.ll index 2754efe4a85ab..5472915442b83 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-0uu.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-0uu.ll @@ -14,35 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; SSE2: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; SSE2: Cost of 17 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; SSE2: Cost of 34 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; SSE2: Cost of 68 for VF 
16: INTERLEAVE-GROUP with factor 3 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 34 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 68 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX1: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 23 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 46 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 92 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 23 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 46 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 92 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: Cost of 4 for VF 2: {{.*}}ir<%v0> = load -; AVX2: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 11 for VF 16: INTERLEAVE-GROUP with 
factor 3 at %v0 -; AVX2: Cost of 23 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 11 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 23 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX512: Cost of 1 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost of 1 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost of 2 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost of 3 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost of 21 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost of 78 for VF 64: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 2 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 3 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 21 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 78 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4 ; entry: br label 
%for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll index fd8620757835a..09e5816b4841a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll @@ -14,35 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; SSE2: Cost of 21 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; SSE2: Cost of 45 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; SSE2: Cost of 90 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; SSE2: Cost of 180 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 90 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 180 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX1: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 27 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 57 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 114 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 228 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an 
estimated cost of 16 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 27 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 57 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 114 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 228 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 5 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 10 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 20 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 44 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 44 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX512: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost of 12 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost 
of 51 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512: Cost of 210 for VF 64: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 12 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 51 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 210 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll index c095dcc458c0b..880fb82ebacd7 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-012u.ll @@ -14,34 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; SSE2: Cost of 21 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; SSE2: Cost of 45 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; SSE2: Cost of 90 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; SSE2: Cost of 180 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = 
load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 90 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 180 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX1: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 59 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 118 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 236 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 28 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 59 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 118 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 236 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 16 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 32 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 67 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; 
AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 67 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX512: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512: Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512: Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512: Cost of 17 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512: Cost of 71 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 6 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 17 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 71 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-0uuu.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-0uuu.ll index 3ea6f71ee88ba..c76199d0f9765 100644 
--- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-0uuu.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4-indices-0uuu.ll @@ -14,35 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; SSE2: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; SSE2: Cost of 15 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; SSE2: Cost of 30 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; SSE2: Cost of 60 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 15 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 30 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 60 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX1: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 25 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 50 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 100 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated 
cost of 25 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 50 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 100 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 8 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 16 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 33 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 16 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 33 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX512: Cost of 1 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512: Cost of 1 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512: Cost of 2 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512: Cost of 5 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512: Cost of 29 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512: Cost of 80 for VF 64: {{.*}}ir<%v0> = load +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; 
AVX512: LV: Found an estimated cost of 1 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 2 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 5 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 29 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4.ll index 8d76e696ed615..4b35a71b2b40c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-4.ll @@ -14,34 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; SSE2: Cost of 28 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; SSE2: Cost of 60 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; SSE2: Cost of 120 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; SSE2: Cost of 240 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 60 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 120 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 240 for VF 16 
For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX1: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 76 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 152 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 304 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 18 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 36 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 76 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 152 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 304 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 10 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 20 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 40 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 84 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 10 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i32, 
ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 84 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX512: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512: Cost of 5 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512: Cost of 8 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512: Cost of 22 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512: Cost of 92 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 8 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 92 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-6.ll index 58fba399064eb..0bfb4df3ddfe7 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-6.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-6.ll @@ -14,31 +14,36 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated 
cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; SSE2: Cost of 42 for VF 2: INTERLEAVE-GROUP with factor 6 at %v0 -; SSE2: Cost of 90 for VF 4: INTERLEAVE-GROUP with factor 6 at %v0 -; SSE2: Cost of 180 for VF 8: INTERLEAVE-GROUP with factor 6 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 42 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 90 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 180 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX1: Cost of 27 for VF 2: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX1: Cost of 54 for VF 4: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX1: Cost of 114 for VF 8: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX1: Cost of 228 for VF 16: INTERLEAVE-GROUP with factor 6 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 27 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 114 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 228 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX2: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX2: Cost of 37 for VF 8: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX2: Cost of 76 for VF 16: INTERLEAVE-GROUP with factor 6 at %v0 
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 37 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 76 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX512: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX512: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX512: Cost of 21 for VF 8: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX512: Cost of 51 for VF 16: INTERLEAVE-GROUP with factor 6 at %v0 -; AVX512: Cost of 210 for VF 32: INTERLEAVE-GROUP with factor 6 at %v0 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 21 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 210 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-2.ll index 2799ab4b5e82b..f5185befc688b 100644 --- 
a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-2.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-2.ll @@ -15,44 +15,49 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 -; SSE2: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; SSE2: Cost of 30 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; SSE2: Cost of 62 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; SSE2: Cost of 126 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 62 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 126 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX1: Cost of 9 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 17 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 33 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 66 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX1: Cost of 134 for VF 32: INTERLEAVE-GROUP with factor 2 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 33 for VF 8 For instruction: 
%v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 66 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 134 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX2: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 5 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX2: Cost of 8 for VF 32: INTERLEAVE-GROUP with factor 2 at %v0 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 3 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 5 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX512DQ: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512DQ: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512DQ: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512DQ: Cost of 5 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512DQ: Cost of 7 for VF 32: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512DQ: Cost of 270 for VF 64: INTERLEAVE-GROUP with factor 2 at %v0 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found 
an estimated cost of 3 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 3 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 5 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 7 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 270 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX512BW: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512BW: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512BW: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512BW: Cost of 9 for VF 16: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512BW: Cost of 17 for VF 32: INTERLEAVE-GROUP with factor 2 at %v0 -; AVX512BW: Cost of 41 for VF 64: INTERLEAVE-GROUP with factor 2 at %v0 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 3 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 9 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 17 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 41 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1 ; entry: br label %for.body diff --git 
a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-3.ll index 74f4a959d3964..f6b775457123b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-3.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-3.ll @@ -15,44 +15,49 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 -; SSE2: Cost of 24 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; SSE2: Cost of 50 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; SSE2: Cost of 93 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; SSE2: Cost of 189 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 50 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 93 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 189 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX1: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 27 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 52 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 99 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX1: Cost of 201 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 16 for VF 2 For 
instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 27 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 52 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 99 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 201 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX2: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 6 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 9 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 13 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX2: Cost of 17 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 9 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 13 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 17 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX512DQ: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512DQ: Cost of 6 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512DQ: Cost of 9 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512DQ: Cost of 14 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512DQ: Cost of 16 for VF 32: INTERLEAVE-GROUP 
with factor 3 at %v0 -; AVX512DQ: Cost of 405 for VF 64: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 9 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 14 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 16 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 405 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX512BW: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512BW: Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512BW: Cost of 13 for VF 8: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512BW: Cost of 13 for VF 16: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512BW: Cost of 16 for VF 32: INTERLEAVE-GROUP with factor 3 at %v0 -; AVX512BW: Cost of 25 for VF 64: INTERLEAVE-GROUP with factor 3 at %v0 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 4 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 13 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 13 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 16 for VF 32 For 
instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 25 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-4.ll index d2d73d33df7ca..13d9884727392 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-4.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-4.ll @@ -15,44 +15,49 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 -; SSE2: Cost of 28 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; SSE2: Cost of 60 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; SSE2: Cost of 124 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; SSE2: Cost of 252 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 60 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 124 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 252 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX1: Cost of 17 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 33 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 66 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 132 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX1: Cost of 268 for VF 32: 
INTERLEAVE-GROUP with factor 4 at %v0 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 33 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 66 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 132 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 268 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX2: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 5 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 13 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 26 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX2: Cost of 60 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 13 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 26 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX512DQ: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512DQ: Cost of 5 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 
-; AVX512DQ: Cost of 13 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512DQ: Cost of 25 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512DQ: Cost of 58 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512DQ: Cost of 540 for VF 64: INTERLEAVE-GROUP with factor 4 at %v0 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 13 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 25 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 58 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 540 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX512BW: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512BW: Cost of 5 for VF 4: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512BW: Cost of 17 for VF 8: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512BW: Cost of 33 for VF 16: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512BW: Cost of 80 for VF 32: INTERLEAVE-GROUP with factor 4 at %v0 -; AVX512BW: Cost of 238 for VF 64: INTERLEAVE-GROUP with factor 4 at %v0 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 5 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 17 for VF 8 For instruction: %v0 = load 
i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 33 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 80 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 238 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-2.ll index e1698c4d0c6cf..85ab5b85c9958 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-2.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-2.ll @@ -14,35 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v1, ptr %out1, align 4 -; SSE2: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; SSE2: Cost of 14 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; SSE2: Cost of 28 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; SSE2: Cost of 56 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v1, ptr %out1, align 4 +; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction: store float %v1, ptr %out1, align 4 +; SSE2: LV: Found an estimated cost of 14 for VF 4 For instruction: store float %v1, ptr %out1, align 4 +; SSE2: LV: Found an estimated cost of 28 for VF 8 For instruction: store float %v1, ptr %out1, align 4 +; SSE2: LV: Found an estimated cost of 56 for VF 16 For instruction: store float %v1, ptr %out1, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v1, ptr %out1, align 4 -; AVX1: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at 
, ir<%out0> -; AVX1: Cost of 14 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 30 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 60 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 120 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v1, ptr %out1, align 4 +; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction: store float %v1, ptr %out1, align 4 +; AVX1: LV: Found an estimated cost of 14 for VF 4 For instruction: store float %v1, ptr %out1, align 4 +; AVX1: LV: Found an estimated cost of 30 for VF 8 For instruction: store float %v1, ptr %out1, align 4 +; AVX1: LV: Found an estimated cost of 60 for VF 16 For instruction: store float %v1, ptr %out1, align 4 +; AVX1: LV: Found an estimated cost of 120 for VF 32 For instruction: store float %v1, ptr %out1, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v1, ptr %out1, align 4 -; AVX2: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 12 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 24 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v1, ptr %out1, align 4 +; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction: store float %v1, ptr %out1, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction: store float %v1, ptr %out1, align 4 +; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction: store float %v1, ptr %out1, align 4 +; AVX2: LV: Found an estimated cost of 12 for VF 16 For instruction: store float %v1, ptr %out1, align 4 +; AVX2: LV: Found an 
estimated cost of 24 for VF 32 For instruction: store float %v1, ptr %out1, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v1, ptr %out1, align 4 -; AVX512: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 2 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 5 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 10 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 20 for VF 64: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v1, ptr %out1, align 4 +; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction: store float %v1, ptr %out1, align 4 +; AVX512: LV: Found an estimated cost of 2 for VF 4 For instruction: store float %v1, ptr %out1, align 4 +; AVX512: LV: Found an estimated cost of 2 for VF 8 For instruction: store float %v1, ptr %out1, align 4 +; AVX512: LV: Found an estimated cost of 5 for VF 16 For instruction: store float %v1, ptr %out1, align 4 +; AVX512: LV: Found an estimated cost of 10 for VF 32 For instruction: store float %v1, ptr %out1, align 4 +; AVX512: LV: Found an estimated cost of 20 for VF 64 For instruction: store float %v1, ptr %out1, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-3.ll index 787f448ad9651..77abfc65a3e59 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-3.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-3.ll @@ -14,35 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost 
of 1 for VF 1 For instruction: store float %v2, ptr %out2, align 4 -; SSE2: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; SSE2: Cost of 24 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; SSE2: Cost of 48 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; SSE2: Cost of 96 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v2, ptr %out2, align 4 +; SSE2: LV: Found an estimated cost of 11 for VF 2 For instruction: store float %v2, ptr %out2, align 4 +; SSE2: LV: Found an estimated cost of 24 for VF 4 For instruction: store float %v2, ptr %out2, align 4 +; SSE2: LV: Found an estimated cost of 48 for VF 8 For instruction: store float %v2, ptr %out2, align 4 +; SSE2: LV: Found an estimated cost of 96 for VF 16 For instruction: store float %v2, ptr %out2, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v2, ptr %out2, align 4 -; AVX1: Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 22 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 45 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 90 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 180 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v2, ptr %out2, align 4 +; AVX1: LV: Found an estimated cost of 12 for VF 2 For instruction: store float %v2, ptr %out2, align 4 +; AVX1: LV: Found an estimated cost of 22 for VF 4 For instruction: store float %v2, ptr %out2, align 4 +; AVX1: LV: Found an estimated cost of 45 for VF 8 For instruction: store float %v2, ptr %out2, align 4 +; AVX1: LV: Found an estimated cost of 90 for VF 16 For instruction: store float %v2, ptr %out2, align 4 +; AVX1: LV: Found an estimated cost of 180 for VF 32 For 
instruction: store float %v2, ptr %out2, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v2, ptr %out2, align 4 -; AVX2: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 14 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 28 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 60 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v2, ptr %out2, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction: store float %v2, ptr %out2, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: store float %v2, ptr %out2, align 4 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: store float %v2, ptr %out2, align 4 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: store float %v2, ptr %out2, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: store float %v2, ptr %out2, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v2, ptr %out2, align 4 -; AVX512: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 8 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 12 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 24 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 48 for VF 64: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v2, ptr %out2, align 4 +; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: store float %v2, ptr %out2, align 4 +; 
AVX512: LV: Found an estimated cost of 4 for VF 4 For instruction: store float %v2, ptr %out2, align 4 +; AVX512: LV: Found an estimated cost of 8 for VF 8 For instruction: store float %v2, ptr %out2, align 4 +; AVX512: LV: Found an estimated cost of 12 for VF 16 For instruction: store float %v2, ptr %out2, align 4 +; AVX512: LV: Found an estimated cost of 24 for VF 32 For instruction: store float %v2, ptr %out2, align 4 +; AVX512: LV: Found an estimated cost of 48 for VF 64 For instruction: store float %v2, ptr %out2, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-4.ll index 4b85900e031d3..699c2eb63cb31 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-4.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-4.ll @@ -14,35 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v3, ptr %out3, align 4 -; SSE2: Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; SSE2: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; SSE2: Cost of 56 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; SSE2: Cost of 112 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v3, ptr %out3, align 4 +; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction: store float %v3, ptr %out3, align 4 +; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction: store float %v3, ptr %out3, align 4 +; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction: store float %v3, ptr %out3, align 4 +; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction: store float %v3, ptr %out3, 
align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v3, ptr %out3, align 4 -; AVX1: Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 60 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 120 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 240 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v3, ptr %out3, align 4 +; AVX1: LV: Found an estimated cost of 12 for VF 2 For instruction: store float %v3, ptr %out3, align 4 +; AVX1: LV: Found an estimated cost of 28 for VF 4 For instruction: store float %v3, ptr %out3, align 4 +; AVX1: LV: Found an estimated cost of 60 for VF 8 For instruction: store float %v3, ptr %out3, align 4 +; AVX1: LV: Found an estimated cost of 120 for VF 16 For instruction: store float %v3, ptr %out3, align 4 +; AVX1: LV: Found an estimated cost of 240 for VF 32 For instruction: store float %v3, ptr %out3, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v3, ptr %out3, align 4 -; AVX2: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 20 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 40 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 80 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v3, ptr %out3, align 4 +; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: store float %v3, ptr %out3, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: store float %v3, ptr %out3, align 4 +; AVX2: LV: Found an 
estimated cost of 20 for VF 8 For instruction: store float %v3, ptr %out3, align 4 +; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: store float %v3, ptr %out3, align 4 +; AVX2: LV: Found an estimated cost of 80 for VF 32 For instruction: store float %v3, ptr %out3, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v3, ptr %out3, align 4 -; AVX512: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512: Cost of 5 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512: Cost of 11 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512: Cost of 22 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512: Cost of 44 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512: Cost of 88 for VF 64: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v3, ptr %out3, align 4 +; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: store float %v3, ptr %out3, align 4 +; AVX512: LV: Found an estimated cost of 5 for VF 4 For instruction: store float %v3, ptr %out3, align 4 +; AVX512: LV: Found an estimated cost of 11 for VF 8 For instruction: store float %v3, ptr %out3, align 4 +; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: store float %v3, ptr %out3, align 4 +; AVX512: LV: Found an estimated cost of 44 for VF 32 For instruction: store float %v3, ptr %out3, align 4 +; AVX512: LV: Found an estimated cost of 88 for VF 64 For instruction: store float %v3, ptr %out3, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll index 0db57f73f5756..86758b5a24fe9 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll @@ -14,32 +14,36 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v4, ptr %out4, align 4 -; SSE2: Cost of 20 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; SSE2: Cost of 44 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; SSE2: Cost of 88 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v4, ptr %out4, align 4 +; SSE2: LV: Found an estimated cost of 20 for VF 2 For instruction: store float %v4, ptr %out4, align 4 +; SSE2: LV: Found an estimated cost of 44 for VF 4 For instruction: store float %v4, ptr %out4, align 4 +; SSE2: LV: Found an estimated cost of 88 for VF 8 For instruction: store float %v4, ptr %out4, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v4, ptr %out4, align 4 -; AVX1: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX1: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX1: Cost of 75 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX1: Cost of 150 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v4, ptr %out4, align 4 +; AVX1: LV: Found an estimated cost of 16 for VF 2 For instruction: store float %v4, ptr %out4, align 4 +; AVX1: LV: Found an estimated cost of 36 for VF 4 For instruction: store float %v4, ptr %out4, align 4 +; AVX1: LV: Found an estimated cost of 75 for VF 8 For instruction: store float %v4, ptr %out4, align 4 +; AVX1: LV: Found an estimated cost of 150 for VF 16 For instruction: store float %v4, ptr %out4, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store 
float %v4, ptr %out4, align 4 -; AVX2: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX2: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX2: Cost of 75 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX2: Cost of 150 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v4, ptr %out4, align 4 +; AVX2: LV: Found an estimated cost of 16 for VF 2 For instruction: store float %v4, ptr %out4, align 4 +; AVX2: LV: Found an estimated cost of 36 for VF 4 For instruction: store float %v4, ptr %out4, align 4 +; AVX2: LV: Found an estimated cost of 75 for VF 8 For instruction: store float %v4, ptr %out4, align 4 +; AVX2: LV: Found an estimated cost of 150 for VF 16 For instruction: store float %v4, ptr %out4, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v4, ptr %out4, align 4 -; AVX512: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512: Cost of 14 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512: Cost of 21 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512: Cost of 35 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512: Cost of 70 for VF 32: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512: Cost of 140 for VF 64: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v4, ptr %out4, align 4 +; AVX512: LV: Found an estimated cost of 7 for VF 2 For instruction: store float %v4, ptr %out4, align 4 +; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction: store float %v4, ptr %out4, align 4 +; AVX512: LV: Found an estimated cost of 21 for VF 8 For instruction: store float %v4, ptr %out4, align 4 +; AVX512: LV: Found an estimated cost of 35 for VF 16 For instruction: store float %v4, ptr %out4, align 4 +; 
AVX512: LV: Found an estimated cost of 70 for VF 32 For instruction: store float %v4, ptr %out4, align 4 +; AVX512: LV: Found an estimated cost of 140 for VF 64 For instruction: store float %v4, ptr %out4, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-6.ll index 3693ac5771929..53c8f59491e76 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-6.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-6.ll @@ -14,32 +14,36 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v5, ptr %out5, align 4 -; SSE2: Cost of 21 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; SSE2: Cost of 48 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; SSE2: Cost of 96 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v5, ptr %out5, align 4 +; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction: store float %v5, ptr %out5, align 4 +; SSE2: LV: Found an estimated cost of 48 for VF 4 For instruction: store float %v5, ptr %out5, align 4 +; SSE2: LV: Found an estimated cost of 96 for VF 8 For instruction: store float %v5, ptr %out5, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v5, ptr %out5, align 4 -; AVX1: Cost of 19 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX1: Cost of 42 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX1: Cost of 90 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX1: Cost of 180 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: 
store float %v5, ptr %out5, align 4 +; AVX1: LV: Found an estimated cost of 19 for VF 2 For instruction: store float %v5, ptr %out5, align 4 +; AVX1: LV: Found an estimated cost of 42 for VF 4 For instruction: store float %v5, ptr %out5, align 4 +; AVX1: LV: Found an estimated cost of 90 for VF 8 For instruction: store float %v5, ptr %out5, align 4 +; AVX1: LV: Found an estimated cost of 180 for VF 16 For instruction: store float %v5, ptr %out5, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v5, ptr %out5, align 4 -; AVX2: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX2: Cost of 15 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX2: Cost of 39 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX2: Cost of 78 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v5, ptr %out5, align 4 +; AVX2: LV: Found an estimated cost of 11 for VF 2 For instruction: store float %v5, ptr %out5, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 4 For instruction: store float %v5, ptr %out5, align 4 +; AVX2: LV: Found an estimated cost of 39 for VF 8 For instruction: store float %v5, ptr %out5, align 4 +; AVX2: LV: Found an estimated cost of 78 for VF 16 For instruction: store float %v5, ptr %out5, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v5, ptr %out5, align 4 -; AVX512: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512: Cost of 17 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512: Cost of 25 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512: Cost of 51 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512: Cost of 102 for VF 32: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512: Cost of 204 for VF 64: INTERLEAVE-GROUP with factor 
6 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v5, ptr %out5, align 4 +; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction: store float %v5, ptr %out5, align 4 +; AVX512: LV: Found an estimated cost of 17 for VF 4 For instruction: store float %v5, ptr %out5, align 4 +; AVX512: LV: Found an estimated cost of 25 for VF 8 For instruction: store float %v5, ptr %out5, align 4 +; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: store float %v5, ptr %out5, align 4 +; AVX512: LV: Found an estimated cost of 102 for VF 32 For instruction: store float %v5, ptr %out5, align 4 +; AVX512: LV: Found an estimated cost of 204 for VF 64 For instruction: store float %v5, ptr %out5, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-7.ll index bb61c8810e292..244bceb69f97d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-7.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-7.ll @@ -14,31 +14,35 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v6, ptr %out6, align 4 -; SSE2: Cost of 23 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; SSE2: Cost of 52 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; SSE2: Cost of 104 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v6, ptr %out6, align 4 +; SSE2: LV: Found an estimated cost of 23 for VF 2 For instruction: store float %v6, ptr %out6, align 4 +; SSE2: LV: Found an estimated cost of 52 for VF 4 For instruction: store float %v6, ptr %out6, align 4 +; SSE2: LV: Found an estimated cost of 104 for 
VF 8 For instruction: store float %v6, ptr %out6, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v6, ptr %out6, align 4 -; AVX1: Cost of 24 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX1: Cost of 50 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX1: Cost of 105 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX1: Cost of 210 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v6, ptr %out6, align 4 +; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: store float %v6, ptr %out6, align 4 +; AVX1: LV: Found an estimated cost of 50 for VF 4 For instruction: store float %v6, ptr %out6, align 4 +; AVX1: LV: Found an estimated cost of 105 for VF 8 For instruction: store float %v6, ptr %out6, align 4 +; AVX1: LV: Found an estimated cost of 210 for VF 16 For instruction: store float %v6, ptr %out6, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v6, ptr %out6, align 4 -; AVX2: Cost of 24 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX2: Cost of 50 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX2: Cost of 105 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX2: Cost of 210 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v6, ptr %out6, align 4 +; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: store float %v6, ptr %out6, align 4 +; AVX2: LV: Found an estimated cost of 50 for VF 4 For instruction: store float %v6, ptr %out6, align 4 +; AVX2: LV: Found an estimated cost of 105 for VF 8 For instruction: store float %v6, ptr %out6, align 4 +; AVX2: LV: Found an estimated cost of 210 for VF 16 For instruction: store float %v6, ptr %out6, align 4 ; ; AVX512-LABEL: 
'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v6, ptr %out6, align 4 -; AVX512: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512: Cost of 20 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512: Cost of 40 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512: Cost of 70 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512: Cost of 140 for VF 32: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v6, ptr %out6, align 4 +; AVX512: LV: Found an estimated cost of 10 for VF 2 For instruction: store float %v6, ptr %out6, align 4 +; AVX512: LV: Found an estimated cost of 20 for VF 4 For instruction: store float %v6, ptr %out6, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 8 For instruction: store float %v6, ptr %out6, align 4 +; AVX512: LV: Found an estimated cost of 70 for VF 16 For instruction: store float %v6, ptr %out6, align 4 +; AVX512: LV: Found an estimated cost of 140 for VF 32 For instruction: store float %v6, ptr %out6, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-2.ll index d73f73b826c88..dd6094e4a7d5c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-2.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-2.ll @@ -14,35 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8 -; SSE2: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; SSE2: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; SSE2: Cost of 24 for VF 8: INTERLEAVE-GROUP with factor 2 
at , ir<%out0> -; SSE2: Cost of 48 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 6 for VF 2 For instruction: store double %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 12 for VF 4 For instruction: store double %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 24 for VF 8 For instruction: store double %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 48 for VF 16 For instruction: store double %v1, ptr %out1, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8 -; AVX1: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 14 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 28 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 56 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 112 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 6 for VF 2 For instruction: store double %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 14 for VF 4 For instruction: store double %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 28 for VF 8 For instruction: store double %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 56 for VF 16 For instruction: store double %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 112 for VF 32 For instruction: store double %v1, ptr %out1, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8 -; AVX2: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 6 for VF 4: 
INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 12 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 24 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 48 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: store double %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: store double %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: store double %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction: store double %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 48 for VF 32 For instruction: store double %v1, ptr %out1, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8 -; AVX512: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 10 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 20 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 40 for VF 64: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 2 for VF 4 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 5 for VF 8 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 16 For 
instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 32 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 64 For instruction: store double %v1, ptr %out1, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll index c8e6f78676d14..3f4216bb3a1ef 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll @@ -14,32 +14,36 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8 -; SSE2: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; SSE2: Cost of 20 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; SSE2: Cost of 40 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 10 for VF 2 For instruction: store double %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 20 for VF 4 For instruction: store double %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 40 for VF 8 For instruction: store double %v2, ptr %out2, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8 -; AVX1: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 24 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 48 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 96 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX1: 
LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 11 for VF 2 For instruction: store double %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 24 for VF 4 For instruction: store double %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 48 for VF 8 For instruction: store double %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 96 for VF 16 For instruction: store double %v2, ptr %out2, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8 -; AVX2: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 9 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 18 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 36 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: store double %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: store double %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: store double %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction: store double %v2, ptr %out2, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8 -; AVX512: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 12 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 24 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 48 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%out0> 
-; AVX512: Cost of 96 for VF 64: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 12 for VF 8 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 24 for VF 16 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 48 for VF 32 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 96 for VF 64 For instruction: store double %v2, ptr %out2, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-4.ll index cf1aabbd5d877..44f9a14424cda 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-4.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-4.ll @@ -14,31 +14,35 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8 -; SSE2: Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; SSE2: Cost of 24 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; SSE2: Cost of 48 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction: store double %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 24 for VF 4 For instruction: store double %v3, 
ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 48 for VF 8 For instruction: store double %v3, ptr %out3, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8 -; AVX1: Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 56 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 112 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 12 for VF 2 For instruction: store double %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 28 for VF 4 For instruction: store double %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 56 for VF 8 For instruction: store double %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 112 for VF 16 For instruction: store double %v3, ptr %out3, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8 -; AVX2: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 28 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 56 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction: store double %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction: store double %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction: store double %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 16 For 
instruction: store double %v3, ptr %out3, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8 -; AVX512: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512: Cost of 22 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512: Cost of 44 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512: Cost of 88 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 11 for VF 4 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 22 for VF 8 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 44 for VF 16 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 88 for VF 32 For instruction: store double %v3, ptr %out3, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-5.ll index bad099868de31..5946c4d7df295 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-5.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-5.ll @@ -14,28 +14,32 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8 -; SSE2: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; SSE2: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 5 at 
, ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 18 for VF 2 For instruction: store double %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 36 for VF 4 For instruction: store double %v4, ptr %out4, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8 -; AVX1: Cost of 20 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX1: Cost of 44 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX1: Cost of 88 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 20 for VF 2 For instruction: store double %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 44 for VF 4 For instruction: store double %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 88 for VF 8 For instruction: store double %v4, ptr %out4, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8 -; AVX2: Cost of 20 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX2: Cost of 44 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX2: Cost of 88 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: store double %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 44 for VF 4 For instruction: store double %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 88 for VF 8 For instruction: store double %v4, ptr %out4, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, 
ptr %out4, align 8 -; AVX512: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512: Cost of 21 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512: Cost of 35 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512: Cost of 70 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512: Cost of 140 for VF 32: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 14 for VF 2 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 21 for VF 4 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 35 for VF 8 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 70 for VF 16 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 140 for VF 32 For instruction: store double %v4, ptr %out4, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-6.ll index c3b552b0811ad..e9c97283fab18 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-6.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-6.ll @@ -14,28 +14,32 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8 -; SSE2: Cost of 20 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; SSE2: Cost of 40 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 
20 for VF 2 For instruction: store double %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 40 for VF 4 For instruction: store double %v5, ptr %out5, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8 -; AVX1: Cost of 21 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX1: Cost of 48 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX1: Cost of 96 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 21 for VF 2 For instruction: store double %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 48 for VF 4 For instruction: store double %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 96 for VF 8 For instruction: store double %v5, ptr %out5, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8 -; AVX2: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX2: Cost of 21 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX2: Cost of 42 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 11 for VF 2 For instruction: store double %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 21 for VF 4 For instruction: store double %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 42 for VF 8 For instruction: store double %v5, ptr %out5, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8 -; AVX512: Cost of 17 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512: Cost of 25 for VF 4: INTERLEAVE-GROUP with factor 6 
at , ir<%out0> -; AVX512: Cost of 51 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512: Cost of 102 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512: Cost of 204 for VF 32: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 17 for VF 2 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 25 for VF 4 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 51 for VF 8 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 102 for VF 16 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 204 for VF 32 For instruction: store double %v5, ptr %out5, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll index f195c6adf7743..555bbe8e44269 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll @@ -14,28 +14,32 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8 -; SSE2: Cost of 22 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; SSE2: Cost of 44 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8 +; SSE2: LV: Found an estimated cost of 22 for VF 2 For instruction: store double %v6, ptr %out6, align 8 +; SSE2: LV: Found an estimated cost of 44 for VF 4 For instruction: store double %v6, ptr 
%out6, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8 -; AVX1: Cost of 23 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX1: Cost of 52 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX1: Cost of 104 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8 +; AVX1: LV: Found an estimated cost of 23 for VF 2 For instruction: store double %v6, ptr %out6, align 8 +; AVX1: LV: Found an estimated cost of 52 for VF 4 For instruction: store double %v6, ptr %out6, align 8 +; AVX1: LV: Found an estimated cost of 104 for VF 8 For instruction: store double %v6, ptr %out6, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8 -; AVX2: Cost of 23 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX2: Cost of 52 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX2: Cost of 104 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8 +; AVX2: LV: Found an estimated cost of 23 for VF 2 For instruction: store double %v6, ptr %out6, align 8 +; AVX2: LV: Found an estimated cost of 52 for VF 4 For instruction: store double %v6, ptr %out6, align 8 +; AVX2: LV: Found an estimated cost of 104 for VF 8 For instruction: store double %v6, ptr %out6, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8 -; AVX512: Cost of 20 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512: Cost of 40 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512: Cost of 70 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512: Cost of 140 for VF 16: INTERLEAVE-GROUP with 
factor 7 at , ir<%out0> -; AVX512: Cost of 280 for VF 32: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 2 For instruction: store double %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 4 For instruction: store double %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 70 for VF 8 For instruction: store double %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: store double %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: store double %v6, ptr %out6, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-2.ll index d21798f5bb70e..5a48776cdad2f 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-2.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-2.ll @@ -15,44 +15,49 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, ptr %out1, align 2 -; SSE2: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; SSE2: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; SSE2: Cost of 34 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; SSE2: Cost of 68 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, ptr %out1, align 2 +; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %v1, ptr %out1, align 2 +; SSE2: LV: Found an estimated cost of 2 for VF 4 For instruction: store i16 %v1, ptr %out1, align 2 
+; SSE2: LV: Found an estimated cost of 34 for VF 8 For instruction: store i16 %v1, ptr %out1, align 2 +; SSE2: LV: Found an estimated cost of 68 for VF 16 For instruction: store i16 %v1, ptr %out1, align 2 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, ptr %out1, align 2 -; AVX1: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 34 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 70 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 140 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX1: LV: Found an estimated cost of 2 for VF 4 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX1: LV: Found an estimated cost of 34 for VF 8 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX1: LV: Found an estimated cost of 70 for VF 16 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX1: LV: Found an estimated cost of 140 for VF 32 For instruction: store i16 %v1, ptr %out1, align 2 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, ptr %out1, align 2 -; AVX2: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 4 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 6 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 12 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX2: LV: Found an estimated cost of 2 for VF 2 For 
instruction: store i16 %v1, ptr %out1, align 2 +; AVX2: LV: Found an estimated cost of 2 for VF 4 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 8 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX2: LV: Found an estimated cost of 6 for VF 16 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX2: LV: Found an estimated cost of 12 for VF 32 For instruction: store i16 %v1, ptr %out1, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, ptr %out1, align 2 -; AVX512DQ: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512DQ: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512DQ: Cost of 4 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512DQ: Cost of 5 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512DQ: Cost of 10 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512DQ: Cost of 284 for VF 64: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX512DQ: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX512DQ: LV: Found an estimated cost of 2 for VF 4 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX512DQ: LV: Found an estimated cost of 4 for VF 8 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX512DQ: LV: Found an estimated cost of 5 for VF 16 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX512DQ: LV: Found an estimated cost of 10 for VF 32 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX512DQ: LV: Found an estimated cost of 284 for VF 64 For instruction: store i16 %v1, ptr %out1, align 2 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, ptr %out1, align 2 -; AVX512BW: Cost of 3 for VF 2: INTERLEAVE-GROUP 
with factor 2 at , ir<%out0> -; AVX512BW: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512BW: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512BW: Cost of 3 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512BW: Cost of 7 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512BW: Cost of 14 for VF 64: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX512BW: LV: Found an estimated cost of 3 for VF 2 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX512BW: LV: Found an estimated cost of 3 for VF 4 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX512BW: LV: Found an estimated cost of 3 for VF 8 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX512BW: LV: Found an estimated cost of 3 for VF 16 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX512BW: LV: Found an estimated cost of 7 for VF 32 For instruction: store i16 %v1, ptr %out1, align 2 +; AVX512BW: LV: Found an estimated cost of 14 for VF 64 For instruction: store i16 %v1, ptr %out1, align 2 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-3.ll index a216313cf53ee..2cfb488d5007f 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-3.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-3.ll @@ -15,44 +15,49 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, ptr %out2, align 2 -; SSE2: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; SSE2: Cost of 26 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; SSE2: Cost of 51 for VF 8: 
INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; SSE2: Cost of 102 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, ptr %out2, align 2 +; SSE2: LV: Found an estimated cost of 16 for VF 2 For instruction: store i16 %v2, ptr %out2, align 2 +; SSE2: LV: Found an estimated cost of 26 for VF 4 For instruction: store i16 %v2, ptr %out2, align 2 +; SSE2: LV: Found an estimated cost of 51 for VF 8 For instruction: store i16 %v2, ptr %out2, align 2 +; SSE2: LV: Found an estimated cost of 102 for VF 16 For instruction: store i16 %v2, ptr %out2, align 2 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, ptr %out2, align 2 -; AVX1: Cost of 15 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 29 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 52 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 105 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 210 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX1: LV: Found an estimated cost of 29 for VF 4 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX1: LV: Found an estimated cost of 52 for VF 8 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX1: LV: Found an estimated cost of 105 for VF 16 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX1: LV: Found an estimated cost of 210 for VF 32 For instruction: store i16 %v2, ptr %out2, align 2 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, ptr %out2, align 2 -; AVX2: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 9 for VF 4: 
INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 14 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 30 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 60 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: store i16 %v2, ptr %out2, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, ptr %out2, align 2 -; AVX512DQ: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512DQ: Cost of 9 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512DQ: Cost of 15 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512DQ: Cost of 29 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512DQ: Cost of 57 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512DQ: Cost of 426 for VF 64: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX512DQ: LV: Found an estimated cost of 7 for VF 2 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX512DQ: LV: Found an estimated cost of 9 for VF 4 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX512DQ: LV: Found an estimated cost of 15 for VF 8 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX512DQ: LV: Found an estimated cost of 29 for VF 16 For 
instruction: store i16 %v2, ptr %out2, align 2 +; AVX512DQ: LV: Found an estimated cost of 57 for VF 32 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX512DQ: LV: Found an estimated cost of 426 for VF 64 For instruction: store i16 %v2, ptr %out2, align 2 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, ptr %out2, align 2 -; AVX512BW: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512BW: Cost of 6 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512BW: Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512BW: Cost of 12 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512BW: Cost of 18 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512BW: Cost of 36 for VF 64: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX512BW: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX512BW: LV: Found an estimated cost of 6 for VF 4 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX512BW: LV: Found an estimated cost of 6 for VF 8 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX512BW: LV: Found an estimated cost of 12 for VF 16 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX512BW: LV: Found an estimated cost of 18 for VF 32 For instruction: store i16 %v2, ptr %out2, align 2 +; AVX512BW: LV: Found an estimated cost of 36 for VF 64 For instruction: store i16 %v2, ptr %out2, align 2 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-4.ll index b4d57ca288256..2e4594fac2361 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-4.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-4.ll @@ -15,44 +15,49 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, ptr %out3, align 2 -; SSE2: Cost of 17 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; SSE2: Cost of 34 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; SSE2: Cost of 68 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; SSE2: Cost of 136 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, ptr %out3, align 2 +; SSE2: LV: Found an estimated cost of 17 for VF 2 For instruction: store i16 %v3, ptr %out3, align 2 +; SSE2: LV: Found an estimated cost of 34 for VF 4 For instruction: store i16 %v3, ptr %out3, align 2 +; SSE2: LV: Found an estimated cost of 68 for VF 8 For instruction: store i16 %v3, ptr %out3, align 2 +; SSE2: LV: Found an estimated cost of 136 for VF 16 For instruction: store i16 %v3, ptr %out3, align 2 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, ptr %out3, align 2 -; AVX1: Cost of 17 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 34 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 68 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 140 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 280 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX1: LV: Found an estimated cost of 34 for VF 4 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX1: LV: Found an estimated cost of 68 for VF 8 For instruction: 
store i16 %v3, ptr %out3, align 2 +; AVX1: LV: Found an estimated cost of 140 for VF 16 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX1: LV: Found an estimated cost of 280 for VF 32 For instruction: store i16 %v3, ptr %out3, align 2 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, ptr %out3, align 2 -; AVX2: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 12 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 36 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 72 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX2: LV: Found an estimated cost of 72 for VF 32 For instruction: store i16 %v3, ptr %out3, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, ptr %out3, align 2 -; AVX512DQ: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512DQ: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512DQ: Cost of 11 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512DQ: Cost of 34 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512DQ: Cost of 68 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512DQ: Cost of 568 for VF 64: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX512DQ: LV: 
Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX512DQ: LV: Found an estimated cost of 3 for VF 2 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX512DQ: LV: Found an estimated cost of 7 for VF 4 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX512DQ: LV: Found an estimated cost of 11 for VF 8 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX512DQ: LV: Found an estimated cost of 34 for VF 16 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX512DQ: LV: Found an estimated cost of 68 for VF 32 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX512DQ: LV: Found an estimated cost of 568 for VF 64 For instruction: store i16 %v3, ptr %out3, align 2 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, ptr %out3, align 2 -; AVX512BW: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512BW: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512BW: Cost of 8 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512BW: Cost of 17 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512BW: Cost of 34 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512BW: Cost of 68 for VF 64: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX512BW: LV: Found an estimated cost of 8 for VF 2 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX512BW: LV: Found an estimated cost of 8 for VF 4 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX512BW: LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX512BW: LV: Found an estimated cost of 17 for VF 16 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX512BW: LV: Found an estimated cost of 34 for VF 32 For instruction: store i16 %v3, ptr %out3, align 2 +; AVX512BW: 
LV: Found an estimated cost of 68 for VF 64 For instruction: store i16 %v3, ptr %out3, align 2 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-5.ll index bbe249e11501a..f536f4438649d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-5.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-5.ll @@ -15,44 +15,49 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, ptr %out4, align 2 -; SSE2: Cost of 22 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; SSE2: Cost of 43 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; SSE2: Cost of 85 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; SSE2: Cost of 170 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, ptr %out4, align 2 +; SSE2: LV: Found an estimated cost of 22 for VF 2 For instruction: store i16 %v4, ptr %out4, align 2 +; SSE2: LV: Found an estimated cost of 43 for VF 4 For instruction: store i16 %v4, ptr %out4, align 2 +; SSE2: LV: Found an estimated cost of 85 for VF 8 For instruction: store i16 %v4, ptr %out4, align 2 +; SSE2: LV: Found an estimated cost of 170 for VF 16 For instruction: store i16 %v4, ptr %out4, align 2 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, ptr %out4, align 2 -; AVX1: Cost of 26 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX1: Cost of 44 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX1: Cost of 86 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX1: Cost of 175 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX1: Cost of 
350 for VF 32: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX1: LV: Found an estimated cost of 44 for VF 4 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX1: LV: Found an estimated cost of 86 for VF 8 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX1: LV: Found an estimated cost of 175 for VF 16 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX1: LV: Found an estimated cost of 350 for VF 32 For instruction: store i16 %v4, ptr %out4, align 2 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, ptr %out4, align 2 -; AVX2: Cost of 26 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX2: Cost of 44 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX2: Cost of 86 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX2: Cost of 175 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX2: Cost of 350 for VF 32: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX2: LV: Found an estimated cost of 44 for VF 4 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX2: LV: Found an estimated cost of 86 for VF 8 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX2: LV: Found an estimated cost of 175 for VF 16 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX2: LV: Found an estimated cost of 350 for VF 32 For instruction: store i16 %v4, ptr %out4, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, ptr %out4, align 2 -; AVX512DQ: Cost of 26 for VF 2: INTERLEAVE-GROUP with 
factor 5 at , ir<%out0> -; AVX512DQ: Cost of 47 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512DQ: Cost of 86 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512DQ: Cost of 176 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512DQ: Cost of 355 for VF 32: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512DQ: Cost of 710 for VF 64: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX512DQ: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX512DQ: LV: Found an estimated cost of 47 for VF 4 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX512DQ: LV: Found an estimated cost of 86 for VF 8 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX512DQ: LV: Found an estimated cost of 176 for VF 16 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX512DQ: LV: Found an estimated cost of 355 for VF 32 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX512DQ: LV: Found an estimated cost of 710 for VF 64 For instruction: store i16 %v4, ptr %out4, align 2 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, ptr %out4, align 2 -; AVX512BW: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512BW: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512BW: Cost of 22 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512BW: Cost of 33 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512BW: Cost of 55 for VF 32: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512BW: Cost of 110 for VF 64: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX512BW: LV: Found an estimated cost of 11 for VF 2 For instruction: store i16 %v4, ptr 
%out4, align 2 +; AVX512BW: LV: Found an estimated cost of 11 for VF 4 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX512BW: LV: Found an estimated cost of 22 for VF 8 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX512BW: LV: Found an estimated cost of 33 for VF 16 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX512BW: LV: Found an estimated cost of 55 for VF 32 For instruction: store i16 %v4, ptr %out4, align 2 +; AVX512BW: LV: Found an estimated cost of 110 for VF 64 For instruction: store i16 %v4, ptr %out4, align 2 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-6.ll index 2b5ad53a13244..1b7522d01ae2e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-6.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-6.ll @@ -15,44 +15,49 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, ptr %out5, align 2 -; SSE2: Cost of 26 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; SSE2: Cost of 51 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; SSE2: Cost of 102 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; SSE2: Cost of 204 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, ptr %out5, align 2 +; SSE2: LV: Found an estimated cost of 26 for VF 2 For instruction: store i16 %v5, ptr %out5, align 2 +; SSE2: LV: Found an estimated cost of 51 for VF 4 For instruction: store i16 %v5, ptr %out5, align 2 +; SSE2: LV: Found an estimated cost of 102 for VF 8 For instruction: store i16 %v5, ptr %out5, align 2 +; SSE2: LV: Found an estimated cost of 204 for VF 16 For instruction: store i16 %v5, ptr 
%out5, align 2 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, ptr %out5, align 2 -; AVX1: Cost of 29 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX1: Cost of 52 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX1: Cost of 102 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX1: Cost of 210 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX1: Cost of 420 for VF 32: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX1: LV: Found an estimated cost of 29 for VF 2 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX1: LV: Found an estimated cost of 52 for VF 4 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX1: LV: Found an estimated cost of 102 for VF 8 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX1: LV: Found an estimated cost of 210 for VF 16 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX1: LV: Found an estimated cost of 420 for VF 32 For instruction: store i16 %v5, ptr %out5, align 2 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, ptr %out5, align 2 -; AVX2: Cost of 13 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX2: Cost of 17 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX2: Cost of 24 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX2: Cost of 64 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX2: Cost of 102 for VF 32: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX2: LV: Found an estimated cost of 13 for VF 2 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX2: LV: Found an estimated cost of 17 for VF 4 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX2: LV: Found an estimated cost 
of 24 for VF 8 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX2: LV: Found an estimated cost of 64 for VF 16 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX2: LV: Found an estimated cost of 102 for VF 32 For instruction: store i16 %v5, ptr %out5, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, ptr %out5, align 2 -; AVX512DQ: Cost of 13 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512DQ: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512DQ: Cost of 23 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512DQ: Cost of 61 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512DQ: Cost of 96 for VF 32: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512DQ: Cost of 852 for VF 64: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX512DQ: LV: Found an estimated cost of 13 for VF 2 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX512DQ: LV: Found an estimated cost of 18 for VF 4 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX512DQ: LV: Found an estimated cost of 23 for VF 8 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX512DQ: LV: Found an estimated cost of 61 for VF 16 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX512DQ: LV: Found an estimated cost of 96 for VF 32 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX512DQ: LV: Found an estimated cost of 852 for VF 64 For instruction: store i16 %v5, ptr %out5, align 2 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, ptr %out5, align 2 -; AVX512BW: Cost of 13 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512BW: Cost of 13 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512BW: Cost of 27 for VF 8: INTERLEAVE-GROUP with factor 6 
at , ir<%out0> -; AVX512BW: Cost of 40 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512BW: Cost of 81 for VF 32: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512BW: Cost of 162 for VF 64: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX512BW: LV: Found an estimated cost of 13 for VF 2 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX512BW: LV: Found an estimated cost of 13 for VF 4 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX512BW: LV: Found an estimated cost of 27 for VF 8 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX512BW: LV: Found an estimated cost of 40 for VF 16 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX512BW: LV: Found an estimated cost of 81 for VF 32 For instruction: store i16 %v5, ptr %out5, align 2 +; AVX512BW: LV: Found an estimated cost of 162 for VF 64 For instruction: store i16 %v5, ptr %out5, align 2 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-7.ll index 781aa39a2d961..f6ed77887c802 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-7.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-7.ll @@ -15,44 +15,49 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v6, ptr %out6, align 2 -; SSE2: Cost of 33 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; SSE2: Cost of 60 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; SSE2: Cost of 119 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; SSE2: Cost of 238 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for 
VF 1 For instruction: store i16 %v6, ptr %out6, align 2 +; SSE2: LV: Found an estimated cost of 33 for VF 2 For instruction: store i16 %v6, ptr %out6, align 2 +; SSE2: LV: Found an estimated cost of 60 for VF 4 For instruction: store i16 %v6, ptr %out6, align 2 +; SSE2: LV: Found an estimated cost of 119 for VF 8 For instruction: store i16 %v6, ptr %out6, align 2 +; SSE2: LV: Found an estimated cost of 238 for VF 16 For instruction: store i16 %v6, ptr %out6, align 2 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v6, ptr %out6, align 2 -; AVX1: Cost of 35 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX1: Cost of 63 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX1: Cost of 120 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX1: Cost of 245 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX1: Cost of 490 for VF 32: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX1: LV: Found an estimated cost of 35 for VF 2 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX1: LV: Found an estimated cost of 63 for VF 4 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX1: LV: Found an estimated cost of 120 for VF 8 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX1: LV: Found an estimated cost of 245 for VF 16 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX1: LV: Found an estimated cost of 490 for VF 32 For instruction: store i16 %v6, ptr %out6, align 2 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v6, ptr %out6, align 2 -; AVX2: Cost of 35 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX2: Cost of 63 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX2: Cost of 120 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX2: Cost of 245 for VF 16: 
INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX2: Cost of 490 for VF 32: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX2: LV: Found an estimated cost of 35 for VF 2 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX2: LV: Found an estimated cost of 63 for VF 4 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX2: LV: Found an estimated cost of 120 for VF 8 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX2: LV: Found an estimated cost of 245 for VF 16 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX2: LV: Found an estimated cost of 490 for VF 32 For instruction: store i16 %v6, ptr %out6, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v6, ptr %out6, align 2 -; AVX512DQ: Cost of 35 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512DQ: Cost of 65 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512DQ: Cost of 122 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512DQ: Cost of 246 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512DQ: Cost of 497 for VF 32: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512DQ: Cost of 994 for VF 64: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX512DQ: LV: Found an estimated cost of 35 for VF 2 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX512DQ: LV: Found an estimated cost of 65 for VF 4 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX512DQ: LV: Found an estimated cost of 122 for VF 8 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX512DQ: LV: Found an estimated cost of 246 for VF 16 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX512DQ: LV: Found an estimated cost of 497 for VF 32 For instruction: store i16 %v6, ptr 
%out6, align 2 +; AVX512DQ: LV: Found an estimated cost of 994 for VF 64 For instruction: store i16 %v6, ptr %out6, align 2 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v6, ptr %out6, align 2 -; AVX512BW: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512BW: Cost of 16 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512BW: Cost of 32 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512BW: Cost of 64 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512BW: Cost of 112 for VF 32: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512BW: Cost of 224 for VF 64: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX512BW: LV: Found an estimated cost of 16 for VF 2 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX512BW: LV: Found an estimated cost of 16 for VF 4 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX512BW: LV: Found an estimated cost of 32 for VF 8 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX512BW: LV: Found an estimated cost of 64 for VF 16 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX512BW: LV: Found an estimated cost of 112 for VF 32 For instruction: store i16 %v6, ptr %out6, align 2 +; AVX512BW: LV: Found an estimated cost of 224 for VF 64 For instruction: store i16 %v6, ptr %out6, align 2 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-8.ll index f524cee862167..cc82d48fadb2c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-8.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-8.ll @@ -15,6 +15,7 @@ target triple = "x86_64-unknown-linux-gnu" define void 
@test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v7, ptr %out7, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v7, ptr %out7, align 2 ; SSE2: LV: Found an estimated cost of 34 for VF 2 For instruction: store i16 %v7, ptr %out7, align 2 ; SSE2: LV: Found an estimated cost of 68 for VF 4 For instruction: store i16 %v7, ptr %out7, align 2 ; SSE2: LV: Found an estimated cost of 136 for VF 8 For instruction: store i16 %v7, ptr %out7, align 2 @@ -22,6 +23,7 @@ define void @test() { ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v7, ptr %out7, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v7, ptr %out7, align 2 ; AVX1: LV: Found an estimated cost of 34 for VF 2 For instruction: store i16 %v7, ptr %out7, align 2 ; AVX1: LV: Found an estimated cost of 68 for VF 4 For instruction: store i16 %v7, ptr %out7, align 2 ; AVX1: LV: Found an estimated cost of 136 for VF 8 For instruction: store i16 %v7, ptr %out7, align 2 @@ -30,6 +32,7 @@ define void @test() { ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v7, ptr %out7, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v7, ptr %out7, align 2 ; AVX2: LV: Found an estimated cost of 34 for VF 2 For instruction: store i16 %v7, ptr %out7, align 2 ; AVX2: LV: Found an estimated cost of 68 for VF 4 For instruction: store i16 %v7, ptr %out7, align 2 ; AVX2: LV: Found an estimated cost of 136 for VF 8 For instruction: store i16 %v7, ptr %out7, align 2 @@ -38,6 +41,7 @@ define void @test() { ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v7, ptr %out7, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v7, ptr %out7, align 2 ; AVX512DQ: LV: Found an estimated 
cost of 34 for VF 2 For instruction: store i16 %v7, ptr %out7, align 2 ; AVX512DQ: LV: Found an estimated cost of 68 for VF 4 For instruction: store i16 %v7, ptr %out7, align 2 ; AVX512DQ: LV: Found an estimated cost of 136 for VF 8 For instruction: store i16 %v7, ptr %out7, align 2 @@ -47,6 +51,7 @@ define void @test() { ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v7, ptr %out7, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v7, ptr %out7, align 2 ; AVX512BW: LV: Found an estimated cost of 18 for VF 2 For instruction: store i16 %v7, ptr %out7, align 2 ; AVX512BW: LV: Found an estimated cost of 18 for VF 4 For instruction: store i16 %v7, ptr %out7, align 2 ; AVX512BW: LV: Found an estimated cost of 37 for VF 8 For instruction: store i16 %v7, ptr %out7, align 2 diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-2.ll index e27e45c753dc8..0a33a96bb6ba1 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-2.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-2.ll @@ -14,35 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v1, ptr %out1, align 4 -; SSE2: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; SSE2: Cost of 30 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; SSE2: Cost of 60 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; SSE2: Cost of 120 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v1, ptr %out1, align 4 +; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %v1, ptr %out1, align 4 +; 
SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction: store i32 %v1, ptr %out1, align 4 +; SSE2: LV: Found an estimated cost of 60 for VF 8 For instruction: store i32 %v1, ptr %out1, align 4 +; SSE2: LV: Found an estimated cost of 120 for VF 16 For instruction: store i32 %v1, ptr %out1, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v1, ptr %out1, align 4 -; AVX1: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 38 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 76 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 152 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v1, ptr %out1, align 4 +; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %v1, ptr %out1, align 4 +; AVX1: LV: Found an estimated cost of 18 for VF 4 For instruction: store i32 %v1, ptr %out1, align 4 +; AVX1: LV: Found an estimated cost of 38 for VF 8 For instruction: store i32 %v1, ptr %out1, align 4 +; AVX1: LV: Found an estimated cost of 76 for VF 16 For instruction: store i32 %v1, ptr %out1, align 4 +; AVX1: LV: Found an estimated cost of 152 for VF 32 For instruction: store i32 %v1, ptr %out1, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v1, ptr %out1, align 4 -; AVX2: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 12 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 24 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For 
instruction: store i32 %v1, ptr %out1, align 4 +; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %v1, ptr %out1, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 4 For instruction: store i32 %v1, ptr %out1, align 4 +; AVX2: LV: Found an estimated cost of 6 for VF 8 For instruction: store i32 %v1, ptr %out1, align 4 +; AVX2: LV: Found an estimated cost of 12 for VF 16 For instruction: store i32 %v1, ptr %out1, align 4 +; AVX2: LV: Found an estimated cost of 24 for VF 32 For instruction: store i32 %v1, ptr %out1, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v1, ptr %out1, align 4 -; AVX512: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 2 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 5 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 10 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 20 for VF 64: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v1, ptr %out1, align 4 +; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction: store i32 %v1, ptr %out1, align 4 +; AVX512: LV: Found an estimated cost of 2 for VF 4 For instruction: store i32 %v1, ptr %out1, align 4 +; AVX512: LV: Found an estimated cost of 2 for VF 8 For instruction: store i32 %v1, ptr %out1, align 4 +; AVX512: LV: Found an estimated cost of 5 for VF 16 For instruction: store i32 %v1, ptr %out1, align 4 +; AVX512: LV: Found an estimated cost of 10 for VF 32 For instruction: store i32 %v1, ptr %out1, align 4 +; AVX512: LV: Found an estimated cost of 20 for VF 64 For instruction: store i32 %v1, ptr %out1, align 4 ; entry: br label %for.body diff --git 
a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-3.ll index 0c0e746cb503d..ad8eca60b43bd 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-3.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-3.ll @@ -14,35 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v2, ptr %out2, align 4 -; SSE2: Cost of 23 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; SSE2: Cost of 48 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; SSE2: Cost of 96 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; SSE2: Cost of 192 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v2, ptr %out2, align 4 +; SSE2: LV: Found an estimated cost of 23 for VF 2 For instruction: store i32 %v2, ptr %out2, align 4 +; SSE2: LV: Found an estimated cost of 48 for VF 4 For instruction: store i32 %v2, ptr %out2, align 4 +; SSE2: LV: Found an estimated cost of 96 for VF 8 For instruction: store i32 %v2, ptr %out2, align 4 +; SSE2: LV: Found an estimated cost of 192 for VF 16 For instruction: store i32 %v2, ptr %out2, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v2, ptr %out2, align 4 -; AVX1: Cost of 17 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 57 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 114 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 228 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: 
store i32 %v2, ptr %out2, align 4 +; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction: store i32 %v2, ptr %out2, align 4 +; AVX1: LV: Found an estimated cost of 28 for VF 4 For instruction: store i32 %v2, ptr %out2, align 4 +; AVX1: LV: Found an estimated cost of 57 for VF 8 For instruction: store i32 %v2, ptr %out2, align 4 +; AVX1: LV: Found an estimated cost of 114 for VF 16 For instruction: store i32 %v2, ptr %out2, align 4 +; AVX1: LV: Found an estimated cost of 228 for VF 32 For instruction: store i32 %v2, ptr %out2, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v2, ptr %out2, align 4 -; AVX2: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 14 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 28 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 60 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v2, ptr %out2, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction: store i32 %v2, ptr %out2, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: store i32 %v2, ptr %out2, align 4 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: store i32 %v2, ptr %out2, align 4 +; AVX2: LV: Found an estimated cost of 28 for VF 16 For instruction: store i32 %v2, ptr %out2, align 4 +; AVX2: LV: Found an estimated cost of 60 for VF 32 For instruction: store i32 %v2, ptr %out2, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v2, ptr %out2, align 4 -; AVX512: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 8 for VF 8: 
INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 12 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 24 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 48 for VF 64: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v2, ptr %out2, align 4 +; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: store i32 %v2, ptr %out2, align 4 +; AVX512: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %v2, ptr %out2, align 4 +; AVX512: LV: Found an estimated cost of 8 for VF 8 For instruction: store i32 %v2, ptr %out2, align 4 +; AVX512: LV: Found an estimated cost of 12 for VF 16 For instruction: store i32 %v2, ptr %out2, align 4 +; AVX512: LV: Found an estimated cost of 24 for VF 32 For instruction: store i32 %v2, ptr %out2, align 4 +; AVX512: LV: Found an estimated cost of 48 for VF 64 For instruction: store i32 %v2, ptr %out2, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-4.ll index 0d4823d965ca4..6c86f56a2da3c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-4.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-4.ll @@ -14,35 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v3, ptr %out3, align 4 -; SSE2: Cost of 28 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; SSE2: Cost of 60 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; SSE2: Cost of 120 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; SSE2: Cost of 240 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 
1 for VF 1 For instruction: store i32 %v3, ptr %out3, align 4 +; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: store i32 %v3, ptr %out3, align 4 +; SSE2: LV: Found an estimated cost of 60 for VF 4 For instruction: store i32 %v3, ptr %out3, align 4 +; SSE2: LV: Found an estimated cost of 120 for VF 8 For instruction: store i32 %v3, ptr %out3, align 4 +; SSE2: LV: Found an estimated cost of 240 for VF 16 For instruction: store i32 %v3, ptr %out3, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v3, ptr %out3, align 4 -; AVX1: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 76 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 152 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 304 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v3, ptr %out3, align 4 +; AVX1: LV: Found an estimated cost of 18 for VF 2 For instruction: store i32 %v3, ptr %out3, align 4 +; AVX1: LV: Found an estimated cost of 36 for VF 4 For instruction: store i32 %v3, ptr %out3, align 4 +; AVX1: LV: Found an estimated cost of 76 for VF 8 For instruction: store i32 %v3, ptr %out3, align 4 +; AVX1: LV: Found an estimated cost of 152 for VF 16 For instruction: store i32 %v3, ptr %out3, align 4 +; AVX1: LV: Found an estimated cost of 304 for VF 32 For instruction: store i32 %v3, ptr %out3, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v3, ptr %out3, align 4 -; AVX2: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 20 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 40 for VF 16: 
INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 80 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v3, ptr %out3, align 4 +; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: store i32 %v3, ptr %out3, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: store i32 %v3, ptr %out3, align 4 +; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: store i32 %v3, ptr %out3, align 4 +; AVX2: LV: Found an estimated cost of 40 for VF 16 For instruction: store i32 %v3, ptr %out3, align 4 +; AVX2: LV: Found an estimated cost of 80 for VF 32 For instruction: store i32 %v3, ptr %out3, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v3, ptr %out3, align 4 -; AVX512: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512: Cost of 5 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512: Cost of 11 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512: Cost of 22 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512: Cost of 44 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512: Cost of 88 for VF 64: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v3, ptr %out3, align 4 +; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: store i32 %v3, ptr %out3, align 4 +; AVX512: LV: Found an estimated cost of 5 for VF 4 For instruction: store i32 %v3, ptr %out3, align 4 +; AVX512: LV: Found an estimated cost of 11 for VF 8 For instruction: store i32 %v3, ptr %out3, align 4 +; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: store i32 %v3, ptr %out3, align 4 +; AVX512: LV: Found an estimated cost of 44 for VF 32 For instruction: store i32 %v3, ptr %out3, align 4 +; AVX512: LV: Found an estimated 
cost of 88 for VF 64 For instruction: store i32 %v3, ptr %out3, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll index 4f04f41daec71..f4fbbec3a46f5 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll @@ -14,32 +14,36 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v4, ptr %out4, align 4 -; SSE2: Cost of 40 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; SSE2: Cost of 84 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; SSE2: Cost of 168 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v4, ptr %out4, align 4 +; SSE2: LV: Found an estimated cost of 40 for VF 2 For instruction: store i32 %v4, ptr %out4, align 4 +; SSE2: LV: Found an estimated cost of 84 for VF 4 For instruction: store i32 %v4, ptr %out4, align 4 +; SSE2: LV: Found an estimated cost of 168 for VF 8 For instruction: store i32 %v4, ptr %out4, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v4, ptr %out4, align 4 -; AVX1: Cost of 24 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX1: Cost of 46 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX1: Cost of 95 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX1: Cost of 190 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v4, ptr %out4, align 4 +; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: store i32 %v4, ptr %out4, align 4 +; AVX1: LV: 
Found an estimated cost of 46 for VF 4 For instruction: store i32 %v4, ptr %out4, align 4 +; AVX1: LV: Found an estimated cost of 95 for VF 8 For instruction: store i32 %v4, ptr %out4, align 4 +; AVX1: LV: Found an estimated cost of 190 for VF 16 For instruction: store i32 %v4, ptr %out4, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v4, ptr %out4, align 4 -; AVX2: Cost of 24 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX2: Cost of 46 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX2: Cost of 95 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX2: Cost of 190 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v4, ptr %out4, align 4 +; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: store i32 %v4, ptr %out4, align 4 +; AVX2: LV: Found an estimated cost of 46 for VF 4 For instruction: store i32 %v4, ptr %out4, align 4 +; AVX2: LV: Found an estimated cost of 95 for VF 8 For instruction: store i32 %v4, ptr %out4, align 4 +; AVX2: LV: Found an estimated cost of 190 for VF 16 For instruction: store i32 %v4, ptr %out4, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v4, ptr %out4, align 4 -; AVX512: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512: Cost of 14 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512: Cost of 21 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512: Cost of 35 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512: Cost of 70 for VF 32: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512: Cost of 140 for VF 64: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v4, ptr %out4, align 4 +; AVX512: LV: Found an estimated cost of 7 for VF 2 
For instruction: store i32 %v4, ptr %out4, align 4 +; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction: store i32 %v4, ptr %out4, align 4 +; AVX512: LV: Found an estimated cost of 21 for VF 8 For instruction: store i32 %v4, ptr %out4, align 4 +; AVX512: LV: Found an estimated cost of 35 for VF 16 For instruction: store i32 %v4, ptr %out4, align 4 +; AVX512: LV: Found an estimated cost of 70 for VF 32 For instruction: store i32 %v4, ptr %out4, align 4 +; AVX512: LV: Found an estimated cost of 140 for VF 64 For instruction: store i32 %v4, ptr %out4, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-6.ll index 9e7570850976b..4f35f667276d8 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-6.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-6.ll @@ -14,32 +14,36 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v5, ptr %out5, align 4 -; SSE2: Cost of 45 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; SSE2: Cost of 96 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; SSE2: Cost of 192 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v5, ptr %out5, align 4 +; SSE2: LV: Found an estimated cost of 45 for VF 2 For instruction: store i32 %v5, ptr %out5, align 4 +; SSE2: LV: Found an estimated cost of 96 for VF 4 For instruction: store i32 %v5, ptr %out5, align 4 +; SSE2: LV: Found an estimated cost of 192 for VF 8 For instruction: store i32 %v5, ptr %out5, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v5, ptr %out5, align 4 -; AVX1: Cost of 
28 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX1: Cost of 54 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX1: Cost of 114 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX1: Cost of 228 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v5, ptr %out5, align 4 +; AVX1: LV: Found an estimated cost of 28 for VF 2 For instruction: store i32 %v5, ptr %out5, align 4 +; AVX1: LV: Found an estimated cost of 54 for VF 4 For instruction: store i32 %v5, ptr %out5, align 4 +; AVX1: LV: Found an estimated cost of 114 for VF 8 For instruction: store i32 %v5, ptr %out5, align 4 +; AVX1: LV: Found an estimated cost of 228 for VF 16 For instruction: store i32 %v5, ptr %out5, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v5, ptr %out5, align 4 -; AVX2: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX2: Cost of 15 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX2: Cost of 39 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX2: Cost of 78 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v5, ptr %out5, align 4 +; AVX2: LV: Found an estimated cost of 11 for VF 2 For instruction: store i32 %v5, ptr %out5, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 4 For instruction: store i32 %v5, ptr %out5, align 4 +; AVX2: LV: Found an estimated cost of 39 for VF 8 For instruction: store i32 %v5, ptr %out5, align 4 +; AVX2: LV: Found an estimated cost of 78 for VF 16 For instruction: store i32 %v5, ptr %out5, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v5, ptr %out5, align 4 -; AVX512: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512: Cost of 17 for VF 4: 
INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512: Cost of 25 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512: Cost of 51 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512: Cost of 102 for VF 32: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512: Cost of 204 for VF 64: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v5, ptr %out5, align 4 +; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction: store i32 %v5, ptr %out5, align 4 +; AVX512: LV: Found an estimated cost of 17 for VF 4 For instruction: store i32 %v5, ptr %out5, align 4 +; AVX512: LV: Found an estimated cost of 25 for VF 8 For instruction: store i32 %v5, ptr %out5, align 4 +; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: store i32 %v5, ptr %out5, align 4 +; AVX512: LV: Found an estimated cost of 102 for VF 32 For instruction: store i32 %v5, ptr %out5, align 4 +; AVX512: LV: Found an estimated cost of 204 for VF 64 For instruction: store i32 %v5, ptr %out5, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-7.ll index 70b60cb815fec..9a4f580a7a2ca 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-7.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-7.ll @@ -14,31 +14,35 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4 -; SSE2: Cost of 51 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; SSE2: Cost of 108 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; SSE2: Cost of 216 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; SSE2: LV: Found an estimated 
cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4 +; SSE2: LV: Found an estimated cost of 51 for VF 2 For instruction: store i32 %v6, ptr %out6, align 4 +; SSE2: LV: Found an estimated cost of 108 for VF 4 For instruction: store i32 %v6, ptr %out6, align 4 +; SSE2: LV: Found an estimated cost of 216 for VF 8 For instruction: store i32 %v6, ptr %out6, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4 -; AVX1: Cost of 35 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX1: Cost of 64 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX1: Cost of 133 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX1: Cost of 266 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4 +; AVX1: LV: Found an estimated cost of 35 for VF 2 For instruction: store i32 %v6, ptr %out6, align 4 +; AVX1: LV: Found an estimated cost of 64 for VF 4 For instruction: store i32 %v6, ptr %out6, align 4 +; AVX1: LV: Found an estimated cost of 133 for VF 8 For instruction: store i32 %v6, ptr %out6, align 4 +; AVX1: LV: Found an estimated cost of 266 for VF 16 For instruction: store i32 %v6, ptr %out6, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4 -; AVX2: Cost of 35 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX2: Cost of 64 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX2: Cost of 133 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX2: Cost of 266 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4 +; AVX2: LV: Found an estimated cost of 35 for VF 2 For instruction: store i32 %v6, ptr %out6, align 4 +; AVX2: LV: Found an estimated 
cost of 64 for VF 4 For instruction: store i32 %v6, ptr %out6, align 4 +; AVX2: LV: Found an estimated cost of 133 for VF 8 For instruction: store i32 %v6, ptr %out6, align 4 +; AVX2: LV: Found an estimated cost of 266 for VF 16 For instruction: store i32 %v6, ptr %out6, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4 -; AVX512: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512: Cost of 20 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512: Cost of 40 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512: Cost of 70 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512: Cost of 140 for VF 32: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v6, ptr %out6, align 4 +; AVX512: LV: Found an estimated cost of 10 for VF 2 For instruction: store i32 %v6, ptr %out6, align 4 +; AVX512: LV: Found an estimated cost of 20 for VF 4 For instruction: store i32 %v6, ptr %out6, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 8 For instruction: store i32 %v6, ptr %out6, align 4 +; AVX512: LV: Found an estimated cost of 70 for VF 16 For instruction: store i32 %v6, ptr %out6, align 4 +; AVX512: LV: Found an estimated cost of 140 for VF 32 For instruction: store i32 %v6, ptr %out6, align 4 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-8.ll index 570431577cf46..2c4ca9993f53d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-8.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-8.ll @@ -14,12 +14,14 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated 
cost of 1 for VF 1 For instruction: store i32 %v7, ptr %out7, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v7, ptr %out7, align 4 ; SSE2: LV: Found an estimated cost of 56 for VF 2 For instruction: store i32 %v7, ptr %out7, align 4 ; SSE2: LV: Found an estimated cost of 120 for VF 4 For instruction: store i32 %v7, ptr %out7, align 4 ; SSE2: LV: Found an estimated cost of 240 for VF 8 For instruction: store i32 %v7, ptr %out7, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v7, ptr %out7, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v7, ptr %out7, align 4 ; AVX1: LV: Found an estimated cost of 36 for VF 2 For instruction: store i32 %v7, ptr %out7, align 4 ; AVX1: LV: Found an estimated cost of 72 for VF 4 For instruction: store i32 %v7, ptr %out7, align 4 ; AVX1: LV: Found an estimated cost of 152 for VF 8 For instruction: store i32 %v7, ptr %out7, align 4 @@ -27,6 +29,7 @@ define void @test() { ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v7, ptr %out7, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v7, ptr %out7, align 4 ; AVX2: LV: Found an estimated cost of 36 for VF 2 For instruction: store i32 %v7, ptr %out7, align 4 ; AVX2: LV: Found an estimated cost of 72 for VF 4 For instruction: store i32 %v7, ptr %out7, align 4 ; AVX2: LV: Found an estimated cost of 152 for VF 8 For instruction: store i32 %v7, ptr %out7, align 4 @@ -34,6 +37,7 @@ define void @test() { ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v7, ptr %out7, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %v7, ptr %out7, align 4 ; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction: store i32 %v7, ptr %out7, align 4 ; AVX512: LV: Found an estimated cost 
of 23 for VF 4 For instruction: store i32 %v7, ptr %out7, align 4 ; AVX512: LV: Found an estimated cost of 46 for VF 8 For instruction: store i32 %v7, ptr %out7, align 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-2.ll index 8352b2f415708..6fbc678408f6b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-2.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-2.ll @@ -14,35 +14,39 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 -; SSE2: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; SSE2: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; SSE2: Cost of 56 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; SSE2: Cost of 112 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 -; AVX1: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 22 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 44 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 88 for VF 16: INTERLEAVE-GROUP 
with factor 2 at , ir<%out0> -; AVX1: Cost of 176 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 10 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 22 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 44 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 88 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 176 for VF 32 For instruction: store i64 %v1, ptr %out1, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 -; AVX2: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 6 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 12 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 24 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 48 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 48 for VF 32 For instruction: store i64 %v1, ptr %out1, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 -; AVX512: Cost of 2 
for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 10 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 20 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512: Cost of 40 for VF 64: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 2 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 2 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 5 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 32 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 64 For instruction: store i64 %v1, ptr %out1, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-3.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-3.ll index 5b5d1ddb6a61c..fe1dad3c3effc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-3.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-3.ll @@ -14,32 +14,36 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 -; SSE2: Cost of 22 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; SSE2: Cost of 44 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; SSE2: Cost of 88 for VF 8: 
INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 22 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 44 for VF 4 For instruction: store i64 %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 88 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 -; AVX1: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 33 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 66 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 132 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 16 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 33 for VF 4 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 66 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 132 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 -; AVX2: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 9 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 18 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 36 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction: 
store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 36 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 -; AVX512: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 12 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 24 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 48 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512: Cost of 96 for VF 64: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 12 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 24 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 48 for VF 32 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 96 for VF 64 For instruction: store i64 %v2, ptr %out2, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-4.ll index 78dbca343a217..4ebb2283f9b7f 100644 --- 
a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-4.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-4.ll @@ -14,31 +14,35 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 -; SSE2: Cost of 28 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; SSE2: Cost of 56 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; SSE2: Cost of 112 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 -; AVX1: Cost of 20 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 44 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 88 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 176 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 20 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 44 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 88 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 176 for VF 16 For instruction: store i64 %v3, ptr %out3, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: 
LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 -; AVX2: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 28 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 56 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 56 for VF 16 For instruction: store i64 %v3, ptr %out3, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 -; AVX512: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512: Cost of 22 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512: Cost of 44 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512: Cost of 88 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 11 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 22 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 44 for VF 16 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 88 
for VF 32 For instruction: store i64 %v3, ptr %out3, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-5.ll index 4b7fe9d580d0d..79c64ecfe457d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-5.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-5.ll @@ -14,28 +14,32 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 -; SSE2: Cost of 38 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; SSE2: Cost of 76 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 38 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 76 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 -; AVX1: Cost of 26 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX1: Cost of 55 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX1: Cost of 110 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 26 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 55 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For 
instruction: store i64 %v4, ptr %out4, align 8 -; AVX2: Cost of 26 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX2: Cost of 55 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX2: Cost of 110 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 26 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 55 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 110 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 -; AVX512: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512: Cost of 21 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512: Cost of 35 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512: Cost of 70 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512: Cost of 140 for VF 32: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 14 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 21 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 35 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 70 for VF 16 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 140 for VF 32 For instruction: store i64 %v4, ptr %out4, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-6.ll 
b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-6.ll index e8f66d10499e7..05909fa7a1fb9 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-6.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-6.ll @@ -14,28 +14,32 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 -; SSE2: Cost of 44 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; SSE2: Cost of 88 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 44 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 88 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 -; AVX1: Cost of 30 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX1: Cost of 66 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX1: Cost of 132 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 30 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 66 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 132 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 -; AVX2: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX2: Cost of 21 for VF 4: INTERLEAVE-GROUP with factor 6 
at , ir<%out0> -; AVX2: Cost of 42 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 11 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 21 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 42 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 -; AVX512: Cost of 17 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512: Cost of 25 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512: Cost of 51 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512: Cost of 102 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512: Cost of 204 for VF 32: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 17 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 25 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 51 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 102 for VF 16 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 204 for VF 32 For instruction: store i64 %v5, ptr %out5, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-7.ll index 7e9e4347cd665..881c7867614b7 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-7.ll 
+++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-7.ll @@ -14,28 +14,32 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 -; SSE2: Cost of 50 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; SSE2: Cost of 100 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 +; SSE2: LV: Found an estimated cost of 50 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8 +; SSE2: LV: Found an estimated cost of 100 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 -; AVX1: Cost of 36 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX1: Cost of 77 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX1: Cost of 154 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX1: LV: Found an estimated cost of 36 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX1: LV: Found an estimated cost of 77 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX1: LV: Found an estimated cost of 154 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 -; AVX2: Cost of 36 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX2: Cost of 77 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX2: Cost of 154 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX2: LV: Found an 
estimated cost of 36 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX2: LV: Found an estimated cost of 77 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX2: LV: Found an estimated cost of 154 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 -; AVX512: Cost of 20 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512: Cost of 40 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512: Cost of 70 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512: Cost of 140 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512: Cost of 280 for VF 32: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 70 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: store i64 %v6, ptr %out6, align 8 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-2.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-2.ll index eac3d14df3c40..87b5f4d3858dc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-2.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-2.ll @@ -15,44 +15,49 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an 
estimated cost of 1 for VF 1 For instruction: store i8 %v1, ptr %out1, align 1 -; SSE2: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; SSE2: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; SSE2: Cost of 2 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; SSE2: Cost of 126 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, ptr %out1, align 1 +; SSE2: LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %v1, ptr %out1, align 1 +; SSE2: LV: Found an estimated cost of 2 for VF 4 For instruction: store i8 %v1, ptr %out1, align 1 +; SSE2: LV: Found an estimated cost of 2 for VF 8 For instruction: store i8 %v1, ptr %out1, align 1 +; SSE2: LV: Found an estimated cost of 126 for VF 16 For instruction: store i8 %v1, ptr %out1, align 1 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, ptr %out1, align 1 -; AVX1: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 2 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 66 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX1: Cost of 134 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX1: LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX1: LV: Found an estimated cost of 2 for VF 4 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX1: LV: Found an estimated cost of 2 for VF 8 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX1: LV: Found an estimated cost of 66 for VF 16 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX1: LV: Found an estimated cost of 134 for VF 32 For instruction: store i8 %v1, ptr %out1, 
align 1 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, ptr %out1, align 1 -; AVX2: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 2 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 4 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX2: Cost of 6 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX2: LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX2: LV: Found an estimated cost of 2 for VF 4 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX2: LV: Found an estimated cost of 2 for VF 8 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 16 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX2: LV: Found an estimated cost of 6 for VF 32 For instruction: store i8 %v1, ptr %out1, align 1 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, ptr %out1, align 1 -; AVX512DQ: Cost of 2 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512DQ: Cost of 2 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512DQ: Cost of 2 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512DQ: Cost of 4 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512DQ: Cost of 5 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512DQ: Cost of 270 for VF 64: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX512DQ: LV: Found an estimated cost of 2 for VF 2 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX512DQ: LV: Found an estimated cost of 2 for VF 4 For 
instruction: store i8 %v1, ptr %out1, align 1 +; AVX512DQ: LV: Found an estimated cost of 2 for VF 8 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX512DQ: LV: Found an estimated cost of 4 for VF 16 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX512DQ: LV: Found an estimated cost of 5 for VF 32 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX512DQ: LV: Found an estimated cost of 270 for VF 64 For instruction: store i8 %v1, ptr %out1, align 1 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, ptr %out1, align 1 -; AVX512BW: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512BW: Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512BW: Cost of 4 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512BW: Cost of 8 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512BW: Cost of 20 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%out0> -; AVX512BW: Cost of 41 for VF 64: INTERLEAVE-GROUP with factor 2 at , ir<%out0> +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX512BW: LV: Found an estimated cost of 4 for VF 2 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX512BW: LV: Found an estimated cost of 4 for VF 4 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX512BW: LV: Found an estimated cost of 4 for VF 8 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX512BW: LV: Found an estimated cost of 8 for VF 16 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX512BW: LV: Found an estimated cost of 20 for VF 32 For instruction: store i8 %v1, ptr %out1, align 1 +; AVX512BW: LV: Found an estimated cost of 41 for VF 64 For instruction: store i8 %v1, ptr %out1, align 1 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-3.ll 
b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-3.ll index e2fa7eb32fe75..9a7503b63f9ff 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-3.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-3.ll @@ -15,44 +15,49 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v2, ptr %out2, align 1 -; SSE2: Cost of 26 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; SSE2: Cost of 52 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; SSE2: Cost of 101 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; SSE2: Cost of 204 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v2, ptr %out2, align 1 +; SSE2: LV: Found an estimated cost of 26 for VF 2 For instruction: store i8 %v2, ptr %out2, align 1 +; SSE2: LV: Found an estimated cost of 52 for VF 4 For instruction: store i8 %v2, ptr %out2, align 1 +; SSE2: LV: Found an estimated cost of 101 for VF 8 For instruction: store i8 %v2, ptr %out2, align 1 +; SSE2: LV: Found an estimated cost of 204 for VF 16 For instruction: store i8 %v2, ptr %out2, align 1 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v2, ptr %out2, align 1 -; AVX1: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 27 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 53 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 100 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX1: Cost of 201 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX1: LV: Found an estimated cost of 16 for VF 2 For 
instruction: store i8 %v2, ptr %out2, align 1 +; AVX1: LV: Found an estimated cost of 27 for VF 4 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX1: LV: Found an estimated cost of 53 for VF 8 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX1: LV: Found an estimated cost of 100 for VF 16 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX1: LV: Found an estimated cost of 201 for VF 32 For instruction: store i8 %v2, ptr %out2, align 1 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v2, ptr %out2, align 1 -; AVX2: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 9 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 13 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX2: Cost of 16 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX2: LV: Found an estimated cost of 9 for VF 8 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX2: LV: Found an estimated cost of 13 for VF 16 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %v2, ptr %out2, align 1 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v2, ptr %out2, align 1 -; AVX512DQ: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512DQ: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512DQ: Cost of 9 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512DQ: Cost of 14 for VF 16: INTERLEAVE-GROUP with 
factor 3 at , ir<%out0> -; AVX512DQ: Cost of 15 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512DQ: Cost of 405 for VF 64: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX512DQ: LV: Found an estimated cost of 8 for VF 2 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX512DQ: LV: Found an estimated cost of 7 for VF 4 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX512DQ: LV: Found an estimated cost of 9 for VF 8 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX512DQ: LV: Found an estimated cost of 14 for VF 16 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX512DQ: LV: Found an estimated cost of 15 for VF 32 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX512DQ: LV: Found an estimated cost of 405 for VF 64 For instruction: store i8 %v2, ptr %out2, align 1 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v2, ptr %out2, align 1 -; AVX512BW: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512BW: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512BW: Cost of 16 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512BW: Cost of 13 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512BW: Cost of 16 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%out0> -; AVX512BW: Cost of 29 for VF 64: INTERLEAVE-GROUP with factor 3 at , ir<%out0> +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX512BW: LV: Found an estimated cost of 8 for VF 2 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX512BW: LV: Found an estimated cost of 8 for VF 4 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX512BW: LV: Found an estimated cost of 16 for VF 8 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX512BW: LV: Found an estimated 
cost of 13 for VF 16 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX512BW: LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %v2, ptr %out2, align 1 +; AVX512BW: LV: Found an estimated cost of 29 for VF 64 For instruction: store i8 %v2, ptr %out2, align 1 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-4.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-4.ll index 21f010df4ee0f..e072d707e23e5 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-4.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-4.ll @@ -15,44 +15,49 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v3, ptr %out3, align 1 -; SSE2: Cost of 28 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; SSE2: Cost of 60 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; SSE2: Cost of 124 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; SSE2: Cost of 252 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v3, ptr %out3, align 1 +; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: store i8 %v3, ptr %out3, align 1 +; SSE2: LV: Found an estimated cost of 60 for VF 4 For instruction: store i8 %v3, ptr %out3, align 1 +; SSE2: LV: Found an estimated cost of 124 for VF 8 For instruction: store i8 %v3, ptr %out3, align 1 +; SSE2: LV: Found an estimated cost of 252 for VF 16 For instruction: store i8 %v3, ptr %out3, align 1 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v3, ptr %out3, align 1 -; AVX1: Cost of 17 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 33 for VF 4: INTERLEAVE-GROUP with factor 4 at , 
ir<%out0> -; AVX1: Cost of 66 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 132 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX1: Cost of 268 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX1: LV: Found an estimated cost of 17 for VF 2 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX1: LV: Found an estimated cost of 33 for VF 4 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX1: LV: Found an estimated cost of 66 for VF 8 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX1: LV: Found an estimated cost of 132 for VF 16 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX1: LV: Found an estimated cost of 268 for VF 32 For instruction: store i8 %v3, ptr %out3, align 1 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v3, ptr %out3, align 1 -; AVX2: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 5 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 10 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX2: Cost of 16 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX2: LV: Found an estimated cost of 5 for VF 2 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX2: LV: Found an estimated cost of 5 for VF 4 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX2: LV: Found an estimated cost of 5 for VF 8 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX2: LV: Found an estimated cost of 10 for VF 16 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %v3, ptr %out3, align 1 ; ; AVX512DQ-LABEL: 'test' ; 
AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v3, ptr %out3, align 1 -; AVX512DQ: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512DQ: Cost of 5 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512DQ: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512DQ: Cost of 9 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512DQ: Cost of 14 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512DQ: Cost of 540 for VF 64: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX512DQ: LV: Found an estimated cost of 5 for VF 2 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX512DQ: LV: Found an estimated cost of 5 for VF 4 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX512DQ: LV: Found an estimated cost of 5 for VF 8 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX512DQ: LV: Found an estimated cost of 9 for VF 16 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX512DQ: LV: Found an estimated cost of 14 for VF 32 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX512DQ: LV: Found an estimated cost of 540 for VF 64 For instruction: store i8 %v3, ptr %out3, align 1 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v3, ptr %out3, align 1 -; AVX512BW: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512BW: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512BW: Cost of 11 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512BW: Cost of 12 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512BW: Cost of 16 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%out0> -; AVX512BW: Cost of 28 for VF 64: INTERLEAVE-GROUP with factor 4 at , ir<%out0> +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For 
instruction: store i8 %v3, ptr %out3, align 1 +; AVX512BW: LV: Found an estimated cost of 11 for VF 2 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX512BW: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX512BW: LV: Found an estimated cost of 11 for VF 8 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX512BW: LV: Found an estimated cost of 12 for VF 16 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX512BW: LV: Found an estimated cost of 16 for VF 32 For instruction: store i8 %v3, ptr %out3, align 1 +; AVX512BW: LV: Found an estimated cost of 28 for VF 64 For instruction: store i8 %v3, ptr %out3, align 1 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-5.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-5.ll index 0ea9060365f58..1c079204cc3bb 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-5.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-5.ll @@ -15,44 +15,49 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v4, ptr %out4, align 1 -; SSE2: Cost of 44 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; SSE2: Cost of 87 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; SSE2: Cost of 178 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; SSE2: Cost of 360 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v4, ptr %out4, align 1 +; SSE2: LV: Found an estimated cost of 44 for VF 2 For instruction: store i8 %v4, ptr %out4, align 1 +; SSE2: LV: Found an estimated cost of 87 for VF 4 For instruction: store i8 %v4, ptr %out4, align 1 +; SSE2: LV: Found an estimated cost of 178 for VF 8 For instruction: 
store i8 %v4, ptr %out4, align 1 +; SSE2: LV: Found an estimated cost of 360 for VF 16 For instruction: store i8 %v4, ptr %out4, align 1 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v4, ptr %out4, align 1 -; AVX1: Cost of 24 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX1: Cost of 46 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX1: Cost of 84 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX1: Cost of 166 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX1: Cost of 335 for VF 32: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX1: LV: Found an estimated cost of 46 for VF 4 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX1: LV: Found an estimated cost of 84 for VF 8 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX1: LV: Found an estimated cost of 166 for VF 16 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX1: LV: Found an estimated cost of 335 for VF 32 For instruction: store i8 %v4, ptr %out4, align 1 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v4, ptr %out4, align 1 -; AVX2: Cost of 24 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX2: Cost of 46 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX2: Cost of 84 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX2: Cost of 166 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX2: Cost of 335 for VF 32: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX2: LV: Found an 
estimated cost of 46 for VF 4 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX2: LV: Found an estimated cost of 84 for VF 8 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX2: LV: Found an estimated cost of 166 for VF 16 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX2: LV: Found an estimated cost of 335 for VF 32 For instruction: store i8 %v4, ptr %out4, align 1 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v4, ptr %out4, align 1 -; AVX512DQ: Cost of 24 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512DQ: Cost of 46 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512DQ: Cost of 87 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512DQ: Cost of 166 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512DQ: Cost of 336 for VF 32: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512DQ: Cost of 675 for VF 64: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX512DQ: LV: Found an estimated cost of 24 for VF 2 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX512DQ: LV: Found an estimated cost of 46 for VF 4 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX512DQ: LV: Found an estimated cost of 87 for VF 8 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX512DQ: LV: Found an estimated cost of 166 for VF 16 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX512DQ: LV: Found an estimated cost of 336 for VF 32 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX512DQ: LV: Found an estimated cost of 675 for VF 64 For instruction: store i8 %v4, ptr %out4, align 1 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v4, ptr %out4, align 1 -; AVX512BW: Cost of 15 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512BW: Cost of 31 for VF 4: 
INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512BW: Cost of 79 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512BW: Cost of 158 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512BW: Cost of 237 for VF 32: INTERLEAVE-GROUP with factor 5 at , ir<%out0> -; AVX512BW: Cost of 395 for VF 64: INTERLEAVE-GROUP with factor 5 at , ir<%out0> +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX512BW: LV: Found an estimated cost of 15 for VF 2 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX512BW: LV: Found an estimated cost of 31 for VF 4 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX512BW: LV: Found an estimated cost of 79 for VF 8 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX512BW: LV: Found an estimated cost of 158 for VF 16 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX512BW: LV: Found an estimated cost of 237 for VF 32 For instruction: store i8 %v4, ptr %out4, align 1 +; AVX512BW: LV: Found an estimated cost of 395 for VF 64 For instruction: store i8 %v4, ptr %out4, align 1 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-6.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-6.ll index 318af2369c5f4..0b260d46b9173 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-6.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-6.ll @@ -15,44 +15,49 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v5, ptr %out5, align 1 -; SSE2: Cost of 49 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; SSE2: Cost of 98 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; SSE2: Cost of 201 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; SSE2: Cost of 408 
for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v5, ptr %out5, align 1 +; SSE2: LV: Found an estimated cost of 49 for VF 2 For instruction: store i8 %v5, ptr %out5, align 1 +; SSE2: LV: Found an estimated cost of 98 for VF 4 For instruction: store i8 %v5, ptr %out5, align 1 +; SSE2: LV: Found an estimated cost of 201 for VF 8 For instruction: store i8 %v5, ptr %out5, align 1 +; SSE2: LV: Found an estimated cost of 408 for VF 16 For instruction: store i8 %v5, ptr %out5, align 1 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v5, ptr %out5, align 1 -; AVX1: Cost of 27 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX1: Cost of 53 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX1: Cost of 100 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX1: Cost of 198 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX1: Cost of 402 for VF 32: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX1: LV: Found an estimated cost of 27 for VF 2 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX1: LV: Found an estimated cost of 53 for VF 4 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX1: LV: Found an estimated cost of 100 for VF 8 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX1: LV: Found an estimated cost of 198 for VF 16 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX1: LV: Found an estimated cost of 402 for VF 32 For instruction: store i8 %v5, ptr %out5, align 1 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v5, ptr %out5, align 1 -; AVX2: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX2: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX2: Cost of 18 for VF 8: 
INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX2: Cost of 30 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX2: Cost of 96 for VF 32: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX2: LV: Found an estimated cost of 10 for VF 2 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX2: LV: Found an estimated cost of 96 for VF 32 For instruction: store i8 %v5, ptr %out5, align 1 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v5, ptr %out5, align 1 -; AVX512DQ: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512DQ: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512DQ: Cost of 19 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512DQ: Cost of 29 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512DQ: Cost of 93 for VF 32: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512DQ: Cost of 810 for VF 64: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX512DQ: LV: Found an estimated cost of 10 for VF 2 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX512DQ: LV: Found an estimated cost of 12 for VF 4 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX512DQ: LV: Found an estimated cost of 19 for VF 8 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX512DQ: LV: Found an estimated cost of 29 for VF 16 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX512DQ: LV: Found an estimated cost 
of 93 for VF 32 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX512DQ: LV: Found an estimated cost of 810 for VF 64 For instruction: store i8 %v5, ptr %out5, align 1 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v5, ptr %out5, align 1 -; AVX512BW: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512BW: Cost of 38 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512BW: Cost of 98 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512BW: Cost of 197 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512BW: Cost of 295 for VF 32: INTERLEAVE-GROUP with factor 6 at , ir<%out0> -; AVX512BW: Cost of 591 for VF 64: INTERLEAVE-GROUP with factor 6 at , ir<%out0> +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX512BW: LV: Found an estimated cost of 18 for VF 2 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX512BW: LV: Found an estimated cost of 38 for VF 4 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX512BW: LV: Found an estimated cost of 98 for VF 8 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX512BW: LV: Found an estimated cost of 197 for VF 16 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX512BW: LV: Found an estimated cost of 295 for VF 32 For instruction: store i8 %v5, ptr %out5, align 1 +; AVX512BW: LV: Found an estimated cost of 591 for VF 64 For instruction: store i8 %v5, ptr %out5, align 1 ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-7.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-7.ll index 659d4308a0b5e..b69559c6dae62 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-7.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-7.ll @@ -15,44 +15,49 @@ target triple = 
"x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v6, ptr %out6, align 1 -; SSE2: Cost of 57 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; SSE2: Cost of 112 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; SSE2: Cost of 225 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; SSE2: Cost of 456 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v6, ptr %out6, align 1 +; SSE2: LV: Found an estimated cost of 57 for VF 2 For instruction: store i8 %v6, ptr %out6, align 1 +; SSE2: LV: Found an estimated cost of 112 for VF 4 For instruction: store i8 %v6, ptr %out6, align 1 +; SSE2: LV: Found an estimated cost of 225 for VF 8 For instruction: store i8 %v6, ptr %out6, align 1 +; SSE2: LV: Found an estimated cost of 456 for VF 16 For instruction: store i8 %v6, ptr %out6, align 1 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v6, ptr %out6, align 1 -; AVX1: Cost of 34 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX1: Cost of 63 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX1: Cost of 119 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX1: Cost of 232 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX1: Cost of 469 for VF 32: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX1: LV: Found an estimated cost of 34 for VF 2 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX1: LV: Found an estimated cost of 63 for VF 4 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX1: LV: Found an estimated cost of 119 for VF 8 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX1: LV: Found an estimated cost of 232 for VF 16 For instruction: store i8 %v6, ptr 
%out6, align 1 +; AVX1: LV: Found an estimated cost of 469 for VF 32 For instruction: store i8 %v6, ptr %out6, align 1 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v6, ptr %out6, align 1 -; AVX2: Cost of 34 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX2: Cost of 63 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX2: Cost of 119 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX2: Cost of 232 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX2: Cost of 469 for VF 32: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX2: LV: Found an estimated cost of 34 for VF 2 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX2: LV: Found an estimated cost of 63 for VF 4 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX2: LV: Found an estimated cost of 119 for VF 8 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX2: LV: Found an estimated cost of 232 for VF 16 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX2: LV: Found an estimated cost of 469 for VF 32 For instruction: store i8 %v6, ptr %out6, align 1 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v6, ptr %out6, align 1 -; AVX512DQ: Cost of 34 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512DQ: Cost of 63 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512DQ: Cost of 121 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512DQ: Cost of 234 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512DQ: Cost of 470 for VF 32: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512DQ: Cost of 945 for VF 64: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX512DQ: LV: Found an 
estimated cost of 34 for VF 2 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX512DQ: LV: Found an estimated cost of 63 for VF 4 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX512DQ: LV: Found an estimated cost of 121 for VF 8 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX512DQ: LV: Found an estimated cost of 234 for VF 16 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX512DQ: LV: Found an estimated cost of 470 for VF 32 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX512DQ: LV: Found an estimated cost of 945 for VF 64 For instruction: store i8 %v6, ptr %out6, align 1 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v6, ptr %out6, align 1 -; AVX512BW: Cost of 22 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512BW: Cost of 46 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512BW: Cost of 118 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512BW: Cost of 236 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512BW: Cost of 472 for VF 32: INTERLEAVE-GROUP with factor 7 at , ir<%out0> -; AVX512BW: Cost of 826 for VF 64: INTERLEAVE-GROUP with factor 7 at , ir<%out0> +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX512BW: LV: Found an estimated cost of 22 for VF 2 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX512BW: LV: Found an estimated cost of 46 for VF 4 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX512BW: LV: Found an estimated cost of 118 for VF 8 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX512BW: LV: Found an estimated cost of 236 for VF 16 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX512BW: LV: Found an estimated cost of 472 for VF 32 For instruction: store i8 %v6, ptr %out6, align 1 +; AVX512BW: LV: Found an estimated cost of 826 for VF 64 For instruction: store i8 %v6, ptr %out6, align 1 ; entry: br 
label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-8.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-8.ll index c066a686efc43..078528b58f6f7 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-8.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-8.ll @@ -15,6 +15,7 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v7, ptr %out7, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v7, ptr %out7, align 1 ; SSE2: LV: Found an estimated cost of 56 for VF 2 For instruction: store i8 %v7, ptr %out7, align 1 ; SSE2: LV: Found an estimated cost of 120 for VF 4 For instruction: store i8 %v7, ptr %out7, align 1 ; SSE2: LV: Found an estimated cost of 248 for VF 8 For instruction: store i8 %v7, ptr %out7, align 1 @@ -22,6 +23,7 @@ define void @test() { ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v7, ptr %out7, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v7, ptr %out7, align 1 ; AVX1: LV: Found an estimated cost of 33 for VF 2 For instruction: store i8 %v7, ptr %out7, align 1 ; AVX1: LV: Found an estimated cost of 66 for VF 4 For instruction: store i8 %v7, ptr %out7, align 1 ; AVX1: LV: Found an estimated cost of 132 for VF 8 For instruction: store i8 %v7, ptr %out7, align 1 @@ -30,6 +32,7 @@ define void @test() { ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v7, ptr %out7, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v7, ptr %out7, align 1 ; AVX2: LV: Found an estimated cost of 33 for VF 2 For instruction: store i8 %v7, ptr %out7, align 1 ; AVX2: LV: Found an estimated cost of 66 
for VF 4 For instruction: store i8 %v7, ptr %out7, align 1 ; AVX2: LV: Found an estimated cost of 132 for VF 8 For instruction: store i8 %v7, ptr %out7, align 1 @@ -38,6 +41,7 @@ define void @test() { ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v7, ptr %out7, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v7, ptr %out7, align 1 ; AVX512DQ: LV: Found an estimated cost of 33 for VF 2 For instruction: store i8 %v7, ptr %out7, align 1 ; AVX512DQ: LV: Found an estimated cost of 66 for VF 4 For instruction: store i8 %v7, ptr %out7, align 1 ; AVX512DQ: LV: Found an estimated cost of 132 for VF 8 For instruction: store i8 %v7, ptr %out7, align 1 @@ -47,6 +51,7 @@ define void @test() { ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v7, ptr %out7, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: store i8 %v7, ptr %out7, align 1 ; AVX512BW: LV: Found an estimated cost of 25 for VF 2 For instruction: store i8 %v7, ptr %out7, align 1 ; AVX512BW: LV: Found an estimated cost of 53 for VF 4 For instruction: store i8 %v7, ptr %out7, align 1 ; AVX512BW: LV: Found an estimated cost of 137 for VF 8 For instruction: store i8 %v7, ptr %out7, align 1 From ed9107f2d71804f6bedff6cd05b1f1a4750eb112 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Sun, 9 Feb 2025 10:25:25 -0600 Subject: [PATCH 095/293] [OpenMP] Replace use of target address space with local (#126119) Summary: This definition is more portable since it defines the correct value for the target. I got rid of the helper mostly because I think it's easy enough to use now that it's a type and being explicit about what's `undef` or `poison` is good. 
--- offload/DeviceRTL/include/DeviceTypes.h | 19 +++++----------- offload/DeviceRTL/include/State.h | 4 ++-- offload/DeviceRTL/src/Configuration.cpp | 4 ++-- offload/DeviceRTL/src/Mapping.cpp | 2 +- offload/DeviceRTL/src/Reduction.cpp | 24 ++++++++++---------- offload/DeviceRTL/src/State.cpp | 27 ++++++++++++++--------- offload/DeviceRTL/src/Synchronization.cpp | 2 +- offload/DeviceRTL/src/Workshare.cpp | 5 +++-- 8 files changed, 43 insertions(+), 44 deletions(-) diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h index 308109b0749f0..2e5d92380f040 100644 --- a/offload/DeviceRTL/include/DeviceTypes.h +++ b/offload/DeviceRTL/include/DeviceTypes.h @@ -12,9 +12,15 @@ #ifndef OMPTARGET_TYPES_H #define OMPTARGET_TYPES_H +#include #include #include +template using Private = __gpu_private T; +template using Constant = __gpu_constant T; +template using Local = __gpu_local T; +template using Global = __gpu_local T; + enum omp_proc_bind_t { omp_proc_bind_false = 0, omp_proc_bind_true = 1, @@ -155,19 +161,6 @@ typedef enum omp_allocator_handle_t { #define __PRAGMA(STR) _Pragma(#STR) #define OMP_PRAGMA(STR) __PRAGMA(omp STR) -#define SHARED(NAME) \ - [[clang::address_space(3)]] NAME [[clang::loader_uninitialized]]; - -// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right -// now that's not the case. -#define THREAD_LOCAL(NAME) \ - [[clang::address_space(5)]] NAME [[clang::loader_uninitialized]] - -// TODO: clang should use address space 4 for omp_const_mem_alloc, maybe it -// does? 
-#define CONSTANT(NAME) \ - [[clang::address_space(4)]] NAME [[clang::loader_uninitialized]] - ///} #endif diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h index 58b619ff1072a..db396dae6e445 100644 --- a/offload/DeviceRTL/include/State.h +++ b/offload/DeviceRTL/include/State.h @@ -86,7 +86,7 @@ struct TeamStateTy { ParallelRegionFnTy ParallelRegionFnVar; }; -extern TeamStateTy [[clang::address_space(3)]] TeamState; +extern Local TeamState; struct ThreadStateTy { @@ -112,7 +112,7 @@ struct ThreadStateTy { } }; -extern ThreadStateTy **[[clang::address_space(3)]] ThreadStates; +extern Local ThreadStates; /// Initialize the state machinery. Must be called by all threads. void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp index a2dfa4a02a094..0c31c66ab2deb 100644 --- a/offload/DeviceRTL/src/Configuration.cpp +++ b/offload/DeviceRTL/src/Configuration.cpp @@ -28,8 +28,8 @@ using namespace ompx; // This variable should be visible to the plugin so we override the default // hidden visibility. [[gnu::used, gnu::retain, gnu::weak, - gnu::visibility("protected")]] DeviceEnvironmentTy - CONSTANT(__omp_rtl_device_environment); + gnu::visibility( + "protected")]] Constant __omp_rtl_device_environment; uint32_t config::getAssumeTeamsOversubscription() { return __omp_rtl_assume_teams_oversubscription; diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp index a0c0f6721a84c..641be81cca3ed 100644 --- a/offload/DeviceRTL/src/Mapping.cpp +++ b/offload/DeviceRTL/src/Mapping.cpp @@ -308,7 +308,7 @@ uint32_t mapping::getNumberOfProcessorElements() { // TODO: This is a workaround for initialization coming from kernels outside of // the TU. We will need to solve this more correctly in the future. 
-[[gnu::weak]] int SHARED(IsSPMDMode); +[[gnu::weak, clang::loader_uninitialized]] Local IsSPMDMode; void mapping::init(bool IsSPMD) { if (mapping::isInitialThreadInLevel0(IsSPMD)) diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp index 25f34005532f7..fffd0063940c6 100644 --- a/offload/DeviceRTL/src/Reduction.cpp +++ b/offload/DeviceRTL/src/Reduction.cpp @@ -71,16 +71,16 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data, if (NumThreads == 1) return 1; - // - // This reduce function handles reduction within a team. It handles - // parallel regions in both L1 and L2 parallelism levels. It also - // supports Generic, SPMD, and NoOMP modes. - // - // 1. Reduce within a warp. - // 2. Warp master copies value to warp 0 via shared memory. - // 3. Warp 0 reduces to a single value. - // 4. The reduced value is available in the thread that returns 1. - // + // + // This reduce function handles reduction within a team. It handles + // parallel regions in both L1 and L2 parallelism levels. It also + // supports Generic, SPMD, and NoOMP modes. + // + // 1. Reduce within a warp. + // 2. Warp master copies value to warp 0 via shared memory. + // 3. Warp 0 reduces to a single value. + // 4. The reduced value is available in the thread that returns 1. + // #if __has_builtin(__nvvm_reflect) if (__nvvm_reflect("__CUDA_ARCH") >= 700) { @@ -196,8 +196,8 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2( uint32_t NumThreads = omp_get_num_threads(); uint32_t TeamId = omp_get_team_num(); uint32_t NumTeams = omp_get_num_teams(); - static unsigned SHARED(Bound); - static unsigned SHARED(ChunkTeamCount); + [[clang::loader_uninitialized]] static Local Bound; + [[clang::loader_uninitialized]] static Local ChunkTeamCount; // Block progress for teams greater than the current upper // limit. 
We always only allow a number of teams less or equal diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp index 89edb4802198c..cbe9735145340 100644 --- a/offload/DeviceRTL/src/State.cpp +++ b/offload/DeviceRTL/src/State.cpp @@ -28,15 +28,17 @@ using namespace ompx; ///{ /// External symbol to access dynamic shared memory. -[[gnu::aligned(allocator::ALIGNMENT)]] extern unsigned char - [[clang::address_space(3)]] DynamicSharedBuffer[]; +[[gnu::aligned( + allocator::ALIGNMENT)]] extern Local DynamicSharedBuffer[]; /// The kernel environment passed to the init method by the compiler. -static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr); +[[clang::loader_uninitialized]] static Local + KernelEnvironmentPtr; /// The kernel launch environment passed as argument to the kernel by the /// runtime. -static KernelLaunchEnvironmentTy *SHARED(KernelLaunchEnvironmentPtr); +[[clang::loader_uninitialized]] static Local + KernelLaunchEnvironmentPtr; ///} @@ -108,7 +110,8 @@ static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256, "Shared scratchpad of this size not supported yet."); /// The allocation of a single shared memory scratchpad. -static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack); +[[clang::loader_uninitialized]] static Local + SharedMemorySmartStack; void SharedMemorySmartStackTy::init(bool IsSPMD) { Usage[mapping::getThreadIdInBlock()] = 0; @@ -220,8 +223,10 @@ void state::TeamStateTy::assertEqual(TeamStateTy &Other) const { ASSERT(HasThreadState == Other.HasThreadState, nullptr); } -state::TeamStateTy SHARED(ompx::state::TeamState); -state::ThreadStateTy **SHARED(ompx::state::ThreadStates); +[[clang::loader_uninitialized]] Local + ompx::state::TeamState; +[[clang::loader_uninitialized]] Local + ompx::state::ThreadStates; namespace { @@ -449,10 +454,10 @@ void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); } /// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication. 
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64; -[[clang::loader_uninitialized]] static void *[[clang::address_space( - 3)]] SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM]; -[[clang::loader_uninitialized]] static void **[[clang::address_space( - 3)]] SharedMemVariableSharingSpacePtr; +[[clang::loader_uninitialized]] static Local + SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM]; +[[clang::loader_uninitialized]] static Local + SharedMemVariableSharingSpacePtr; void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) { if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) { diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp index a5090b96560c8..0854c21ee152a 100644 --- a/offload/DeviceRTL/src/Synchronization.cpp +++ b/offload/DeviceRTL/src/Synchronization.cpp @@ -69,7 +69,7 @@ uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering, } } -uint32_t SHARED(namedBarrierTracker); +[[clang::loader_uninitialized]] Local namedBarrierTracker; void namedBarrierInit() { // Don't have global ctors, and shared memory is not zero init diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp index b1f037a11bddf..de4ed2e2102a6 100644 --- a/offload/DeviceRTL/src/Workshare.cpp +++ b/offload/DeviceRTL/src/Workshare.cpp @@ -45,7 +45,7 @@ struct DynamicScheduleTracker { #define LAST_CHUNK 2 // TODO: This variable is a hack inherited from the old runtime. -static uint64_t SHARED(Cnt); +[[clang::loader_uninitialized]] static Local Cnt; template struct omptarget_nvptx_LoopSupport { //////////////////////////////////////////////////////////////////////////////// @@ -457,7 +457,8 @@ template struct omptarget_nvptx_LoopSupport { // // __kmpc_dispatch_deinit // -static DynamicScheduleTracker **SHARED(ThreadDST); +[[clang::loader_uninitialized]] static Local + ThreadDST; // Create a new DST, link the current one, and define the new as current. 
static DynamicScheduleTracker *pushDST() { From 6444ed53658354efb8fc126f93281bc13f1d6300 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 9 Feb 2025 08:55:31 -0800 Subject: [PATCH 096/293] [AST] Avoid repeated hash lookups (NFC) (#126400) --- clang/lib/AST/VTableBuilder.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/clang/lib/AST/VTableBuilder.cpp b/clang/lib/AST/VTableBuilder.cpp index fa3055dd1206f..19d76df99dbe3 100644 --- a/clang/lib/AST/VTableBuilder.cpp +++ b/clang/lib/AST/VTableBuilder.cpp @@ -1169,12 +1169,13 @@ void ItaniumVTableBuilder::ComputeThisAdjustments() { // // Do not set ThunkInfo::Method if Idx is already in VTableThunks. This // can happen when covariant return adjustment is required too. - if (!VTableThunks.count(Idx)) { + auto [It, Inserted] = VTableThunks.try_emplace(Idx); + if (Inserted) { const CXXMethodDecl *Method = VTables.findOriginalMethodInMap(MD); - VTableThunks[Idx].Method = Method; - VTableThunks[Idx].ThisType = Method->getThisType().getTypePtr(); + It->second.Method = Method; + It->second.ThisType = Method->getThisType().getTypePtr(); } - VTableThunks[Idx].This = ThisAdjustment; + It->second.This = ThisAdjustment; }; SetThisAdjustmentThunk(VTableIndex); @@ -1653,8 +1654,9 @@ void ItaniumVTableBuilder::AddMethods( // findOriginalMethod to find the method that created the entry if the // method in the entry requires adjustment. 
if (!ReturnAdjustment.isEmpty()) { - VTableThunks[Components.size()].Method = MD; - VTableThunks[Components.size()].ThisType = MD->getThisType().getTypePtr(); + auto &VTT = VTableThunks[Components.size()]; + VTT.Method = MD; + VTT.ThisType = MD->getThisType().getTypePtr(); } AddMethod(Overrider.Method, ReturnAdjustment); From c741cf1617c22d18316fd98af1c30dc244eab22e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 9 Feb 2025 08:55:43 -0800 Subject: [PATCH 097/293] [CodeGen] Avoid repeated hash lookups (NFC) (#126403) --- llvm/lib/CodeGen/StackColoring.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp index 27c65d234a618..552dfdfe16ce4 100644 --- a/llvm/lib/CodeGen/StackColoring.cpp +++ b/llvm/lib/CodeGen/StackColoring.cpp @@ -1115,9 +1115,10 @@ void StackColoring::remapInstructions(DenseMap &SlotRemap) { if (WinEHFuncInfo *EHInfo = MF->getWinEHFuncInfo()) for (WinEHTryBlockMapEntry &TBME : EHInfo->TryBlockMap) for (WinEHHandlerType &H : TBME.HandlerArray) - if (H.CatchObj.FrameIndex != std::numeric_limits::max() && - SlotRemap.count(H.CatchObj.FrameIndex)) - H.CatchObj.FrameIndex = SlotRemap[H.CatchObj.FrameIndex]; + if (H.CatchObj.FrameIndex != std::numeric_limits::max()) + if (auto It = SlotRemap.find(H.CatchObj.FrameIndex); + It != SlotRemap.end()) + H.CatchObj.FrameIndex = It->second; LLVM_DEBUG(dbgs() << "Fixed " << FixedMemOp << " machine memory operands.\n"); LLVM_DEBUG(dbgs() << "Fixed " << FixedDbg << " debug locations.\n"); From db348c8e8b2472563a8db363b18d2604968ae43b Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 9 Feb 2025 08:55:55 -0800 Subject: [PATCH 098/293] [Passes] Avoid repeated hash lookups (NFC) (#126404) --- llvm/include/llvm/Passes/DroppedVariableStats.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Passes/DroppedVariableStats.h b/llvm/include/llvm/Passes/DroppedVariableStats.h index 
e2e91891e24c1..30fbeae703b03 100644 --- a/llvm/include/llvm/Passes/DroppedVariableStats.h +++ b/llvm/include/llvm/Passes/DroppedVariableStats.h @@ -96,9 +96,10 @@ class DroppedVariableStats { DenseSet &DebugVariablesBeforeSet = DbgVariables.DebugVariablesBefore; DenseSet &DebugVariablesAfterSet = DbgVariables.DebugVariablesAfter; - if (InlinedAts.back().find(FuncName) == InlinedAts.back().end()) + auto It = InlinedAts.back().find(FuncName); + if (It == InlinedAts.back().end()) return; - DenseMap &InlinedAtsMap = InlinedAts.back()[FuncName]; + DenseMap &InlinedAtsMap = It->second; // Find an Instruction that shares the same scope as the dropped #dbg_value // or has a scope that is the child of the scope of the #dbg_value, and has // an inlinedAt equal to the inlinedAt of the #dbg_value or it's inlinedAt From 87ae9547ea0f590f8adbbdfaeca28ef999ddffa8 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 9 Feb 2025 08:56:17 -0800 Subject: [PATCH 099/293] [TableGen] Avoid repeated hash lookups (NFC) (#126405) --- llvm/utils/TableGen/PseudoLoweringEmitter.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp index bcbc6ea20751f..6d39d1496c09e 100644 --- a/llvm/utils/TableGen/PseudoLoweringEmitter.cpp +++ b/llvm/utils/TableGen/PseudoLoweringEmitter.cpp @@ -79,8 +79,9 @@ unsigned PseudoLoweringEmitter::addDagOperandMapping( // "zero_reg" definition. 
if (DI->getDef()->isSubClassOf("Register") || DI->getDef()->getName() == "zero_reg") { - OperandMap[BaseIdx + i].Kind = OpData::Reg; - OperandMap[BaseIdx + i].Data.Reg = DI->getDef(); + auto &Entry = OperandMap[BaseIdx + i]; + Entry.Kind = OpData::Reg; + Entry.Data.Reg = DI->getDef(); ++OpsAdded; continue; } @@ -105,12 +106,14 @@ unsigned PseudoLoweringEmitter::addDagOperandMapping( OperandMap[BaseIdx + i + I].Kind = OpData::Operand; OpsAdded += Insn.Operands[i].MINumOperands; } else if (const IntInit *II = dyn_cast(Dag->getArg(i))) { - OperandMap[BaseIdx + i].Kind = OpData::Imm; - OperandMap[BaseIdx + i].Data.Imm = II->getValue(); + auto &Entry = OperandMap[BaseIdx + i]; + Entry.Kind = OpData::Imm; + Entry.Data.Imm = II->getValue(); ++OpsAdded; } else if (const auto *BI = dyn_cast(Dag->getArg(i))) { - OperandMap[BaseIdx + i].Kind = OpData::Imm; - OperandMap[BaseIdx + i].Data.Imm = *BI->convertInitializerToInt(); + auto &Entry = OperandMap[BaseIdx + i]; + Entry.Kind = OpData::Imm; + Entry.Data.Imm = *BI->convertInitializerToInt(); ++OpsAdded; } else if (const DagInit *SubDag = dyn_cast(Dag->getArg(i))) { // Just add the operands recursively. 
This is almost certainly From 4972722f90deddf45c29958070bb1beb509e72ac Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 9 Feb 2025 17:11:18 +0000 Subject: [PATCH 100/293] [X86] lowerV4F64Shuffle - prefer lowerShuffleAsDecomposedShuffleMerge if we're blending inplace/splatable shuffle inputs on AVX2 targets (#126420) More aggressively use broadcast instructions where possible Fixes #50315 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 20 ++++- .../copy-low-subvec-elt-to-high-subvec-elt.ll | 4 +- llvm/test/CodeGen/X86/horizontal-sum.ll | 20 ++--- llvm/test/CodeGen/X86/matrix-multiply.ll | 82 +++++++++---------- .../X86/vector-shuffle-combining-avx.ll | 38 ++++----- 5 files changed, 91 insertions(+), 73 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 744e4e740cb21..9a916a663a64c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -12689,6 +12689,20 @@ static bool isShuffleMaskInputInPlace(int Input, ArrayRef Mask) { return true; } +/// Test whether the specified input (0 or 1) is a broadcast/splat blended by +/// the given mask. +/// +static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef Mask, + int BroadcastableElement = 0) { + assert((Input == 0 || Input == 1) && "Only two inputs to shuffles."); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0 && Mask[i] / Size == Input && + Mask[i] % Size != BroadcastableElement) + return false; + return true; +} + /// If we are extracting two 128-bit halves of a vector and shuffling the /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a /// multi-shuffle lowering. 
@@ -16190,6 +16204,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef Mask, bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask); bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask); + bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask); + bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask); // If we have lane crossing shuffles AND they don't all come from the lower // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)). @@ -16198,7 +16214,9 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef Mask, if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) && !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) && (V1.getOpcode() != ISD::BUILD_VECTOR) && - (V2.getOpcode() != ISD::BUILD_VECTOR)) + (V2.getOpcode() != ISD::BUILD_VECTOR) && + (!Subtarget.hasAVX2() || + !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat)))) return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG); // If we have one input in place, then we can permute the other input and diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll index 1baaab0931cb9..26a88ab15e3cc 100644 --- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll +++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll @@ -151,8 +151,8 @@ define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_2_un define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary(<4 x double> %x, <4 x double> %y) nounwind { ; CHECK-LABEL: vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary: ; CHECK: # %bb.0: -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] +; CHECK-NEXT: vbroadcastsd %xmm1, %ymm1 +; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; CHECK-NEXT: retq %r = shufflevector <4 x double> %x, <4 x 
double> %y, <4 x i32> ret <4 x double> %r diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll index 5fe1e2996ee9b..e2cc3ae0dca0a 100644 --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -256,11 +256,11 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2 -; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm1 +; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32: @@ -277,11 +277,11 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; AVX2-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2 -; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm1 +; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: retq %9 = shufflevector <4 x float> 
%0, <4 x float> poison, <2 x i32> %10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll index bdc1ff4c157e4..a38ca339cd5e1 100644 --- a/llvm/test/CodeGen/X86/matrix-multiply.ll +++ b/llvm/test/CodeGen/X86/matrix-multiply.ll @@ -659,57 +659,57 @@ define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwin ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: movq %rdi, %rax ; AVX2-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX2-NEXT: vmulpd %xmm1, %xmm9, %xmm0 -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX2-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX2-NEXT: vmulpd %xmm4, %xmm3, %xmm10 -; AVX2-NEXT: vaddpd %xmm0, %xmm10, %xmm0 +; AVX2-NEXT: vmulpd %xmm0, %xmm9, %xmm10 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm4[0] +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-NEXT: vmulpd %xmm3, %xmm1, %xmm4 +; AVX2-NEXT: vaddpd %xmm4, %xmm10, %xmm4 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] ; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] ; AVX2-NEXT: vmulpd %xmm7, %xmm6, %xmm10 -; AVX2-NEXT: vaddpd %xmm0, %xmm10, %xmm0 +; AVX2-NEXT: vaddpd %xmm4, %xmm10, %xmm4 ; AVX2-NEXT: vmulsd %xmm2, %xmm9, %xmm9 -; AVX2-NEXT: vmulsd %xmm4, %xmm5, %xmm4 -; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4 +; AVX2-NEXT: vmulsd %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: vaddsd %xmm3, %xmm9, %xmm3 ; AVX2-NEXT: vmulsd %xmm7, %xmm8, %xmm7 -; AVX2-NEXT: vaddsd %xmm7, %xmm4, %xmm4 -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX2-NEXT: vmulpd %xmm7, %xmm1, %xmm9 +; AVX2-NEXT: vaddsd %xmm7, %xmm3, %xmm3 +; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX2-NEXT: vmulpd %xmm4, %xmm0, %xmm7 +; 
AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX2-NEXT: vmulpd %xmm1, %xmm9, %xmm10 +; AVX2-NEXT: vaddpd %xmm7, %xmm10, %xmm7 ; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX2-NEXT: vmulpd %xmm3, %xmm10, %xmm11 -; AVX2-NEXT: vaddpd %xmm11, %xmm9, %xmm9 -; AVX2-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-NEXT: vmulpd %xmm6, %xmm11, %xmm12 -; AVX2-NEXT: vaddpd %xmm12, %xmm9, %xmm9 -; AVX2-NEXT: vmulsd %xmm7, %xmm2, %xmm7 -; AVX2-NEXT: vmulsd %xmm5, %xmm10, %xmm10 -; AVX2-NEXT: vaddsd %xmm7, %xmm10, %xmm7 -; AVX2-NEXT: vmulsd %xmm11, %xmm8, %xmm10 -; AVX2-NEXT: vaddsd %xmm7, %xmm10, %xmm7 +; AVX2-NEXT: vmulpd %xmm6, %xmm10, %xmm11 +; AVX2-NEXT: vaddpd %xmm7, %xmm11, %xmm7 +; AVX2-NEXT: vmulsd %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vmulsd %xmm5, %xmm9, %xmm9 +; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4 +; AVX2-NEXT: vmulsd %xmm10, %xmm8, %xmm9 +; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4 +; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX2-NEXT: vmulpd %xmm0, %xmm9, %xmm0 ; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] ; AVX2-NEXT: vmulpd %xmm1, %xmm10, %xmm1 -; AVX2-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] -; AVX2-NEXT: vmulpd %xmm3, %xmm11, %xmm3 -; AVX2-NEXT: vaddpd %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX2-NEXT: vmulpd %xmm3, %xmm6, %xmm6 -; AVX2-NEXT: vaddpd %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vmulsd %xmm2, %xmm10, %xmm2 -; AVX2-NEXT: vmulsd %xmm5, %xmm11, %xmm5 +; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX2-NEXT: vmulpd %xmm1, %xmm6, %xmm6 +; AVX2-NEXT: vaddpd %xmm6, %xmm0, %xmm0 +; AVX2-NEXT: vmulsd %xmm2, %xmm9, %xmm2 +; AVX2-NEXT: vmulsd %xmm5, %xmm10, %xmm5 ; AVX2-NEXT: vaddsd %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vmulsd %xmm3, %xmm8, %xmm3 -; AVX2-NEXT: vaddsd %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[2] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm3 -; AVX2-NEXT: vinsertf128 $1, 
%xmm1, %ymm9, %ymm1 -; AVX2-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3] -; AVX2-NEXT: vmovsd %xmm2, 64(%rdi) -; AVX2-NEXT: vmovapd %ymm1, 32(%rdi) -; AVX2-NEXT: vmovapd %ymm0, (%rdi) +; AVX2-NEXT: vmulsd %xmm1, %xmm8, %xmm1 +; AVX2-NEXT: vaddsd %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vbroadcastsd %xmm7, %ymm2 +; AVX2-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm3 +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 +; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[3] +; AVX2-NEXT: vmovsd %xmm1, 64(%rdi) +; AVX2-NEXT: vmovapd %ymm0, 32(%rdi) +; AVX2-NEXT: vmovapd %ymm2, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index 79602a18693db..00af58544e25c 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -493,11 +493,11 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; X86-AVX2-NEXT: vbroadcastsd %xmm1, %ymm3 ; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1] ; X86-AVX2-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0] ; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1] -; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1] +; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1] ; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3] ; X86-AVX2-NEXT: vmovapd %ymm3, (%edx) ; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2,3] @@ -520,13 +520,13 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx 
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 ; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9] ; X86-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm3 -; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,10,2,9] -; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1] -; X86-AVX512-NEXT: vpermt2pd %zmm4, %zmm5, %zmm6 -; X86-AVX512-NEXT: vmovapd %ymm6, (%edx) +; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1] +; X86-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm4 +; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1] +; X86-AVX512-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3] +; X86-AVX512-NEXT: vmovapd %ymm4, (%edx) ; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1] ; X86-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4 ; X86-AVX512-NEXT: vmovapd %ymm4, (%ecx) @@ -563,11 +563,11 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n ; ; X64-AVX2-LABEL: PR48908: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 +; X64-AVX2-NEXT: vbroadcastsd %xmm1, %ymm3 ; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1] ; X64-AVX2-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0] ; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1] -; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1] +; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1] ; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3] ; X64-AVX2-NEXT: vmovapd %ymm3, (%rdi) ; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2,3] @@ -587,16 +587,16 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n ; X64-AVX512-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; X64-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,2,8,9] -; 
X64-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm4 -; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,10,2,9] -; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1] -; X64-AVX512-NEXT: vpermt2pd %zmm3, %zmm5, %zmm6 -; X64-AVX512-NEXT: vmovapd %ymm6, (%rdi) -; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,3,10,1] -; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm4, %zmm3 -; X64-AVX512-NEXT: vmovapd %ymm3, (%rsi) +; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9] +; X64-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm3 +; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1] +; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm4 +; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1] +; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3] +; X64-AVX512-NEXT: vmovapd %ymm4, (%rdi) +; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1] +; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4 +; X64-AVX512-NEXT: vmovapd %ymm4, (%rsi) ; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,11] ; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3 ; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,8,9,3] From 3d140004c70e2bc79416825e43207e8b711c56d9 Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Sun, 9 Feb 2025 18:16:51 +0100 Subject: [PATCH 101/293] [ValueTracking] Test for not in dominating condition. 
(NFC) --- .../InstCombine/fpclass-from-dom-cond.ll | 29 +++++++++++++++++ .../test/Transforms/InstCombine/known-bits.ll | 31 +++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/fpclass-from-dom-cond.ll b/llvm/test/Transforms/InstCombine/fpclass-from-dom-cond.ll index 78329faf34172..e6df7fab356b4 100644 --- a/llvm/test/Transforms/InstCombine/fpclass-from-dom-cond.ll +++ b/llvm/test/Transforms/InstCombine/fpclass-from-dom-cond.ll @@ -518,3 +518,32 @@ if.else: if.end: ret i1 false } + +define i1 @test_inv_and(float %x, i1 %cond2) { +; CHECK-LABEL: define i1 @test_inv_and( +; CHECK-SAME: float [[X:%.*]], i1 [[COND2:%.*]]) { +; CHECK-NEXT: [[COND:%.*]] = fcmp oge float [[X]], -1.000000e+00 +; CHECK-NEXT: call void @use(i1 [[COND]]) +; CHECK-NEXT: [[NOT:%.*]] = xor i1 [[COND]], true +; CHECK-NEXT: [[AND:%.*]] = and i1 [[COND2]], [[NOT]] +; CHECK-NEXT: br i1 [[AND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[RET1:%.*]] = fcmp oeq float [[X]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[RET1]] +; CHECK: if.else: +; CHECK-NEXT: ret i1 false +; + %cond = fcmp oge float %x, -1.0 + %neg = fneg float %x + call void @use(i1 %cond) + %not = xor i1 %cond, true + %and = and i1 %not, %cond2 + br i1 %and, label %if.then, label %if.else +if.then: + %ret1 = fcmp oeq float %neg, 0xFFF0000000000000 + ret i1 %ret1 +if.else: + ret i1 false +} + +declare void @use(i1) diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll index 5b36684d7149b..7563a63f607f0 100644 --- a/llvm/test/Transforms/InstCombine/known-bits.ll +++ b/llvm/test/Transforms/InstCombine/known-bits.ll @@ -2365,6 +2365,37 @@ exit: ret i8 %or2 } +define i8 @test_inv_cond_and(i8 %x, i1 %c) { +; CHECK-LABEL: @test_inv_cond_and( +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], 3 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i8 [[AND]], 0 +; CHECK-NEXT: call void @use(i1 [[CMP]]) +; CHECK-NEXT: 
[[NOT:%.*]] = xor i1 [[CMP]], true +; CHECK-NEXT: [[COND:%.*]] = and i1 [[C:%.*]], [[NOT]] +; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]] +; CHECK: if: +; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4 +; CHECK-NEXT: ret i8 [[OR1]] +; CHECK: exit: +; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4 +; CHECK-NEXT: ret i8 [[OR2]] +; + %and = and i8 %x, 3 + %cmp = icmp ne i8 %and, 0 + call void @use(i1 %cmp) + %not = xor i1 %cmp, true + %cond = and i1 %not, %c + br i1 %cond, label %if, label %exit + +if: + %or1 = or i8 %x, -4 + ret i8 %or1 + +exit: + %or2 = or i8 %x, -4 + ret i8 %or2 +} + declare void @dummy() declare void @use(i1) declare void @sink(i8) From 472220077383b2dbd9cfcaffcc6030558ba7a744 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Sun, 9 Feb 2025 11:16:18 -0800 Subject: [PATCH 102/293] [lldb] Merge TestSBCommandReturnObject tests In #125132, Michael pointed out that there are now two tests with the same name: ./lldb/test/API/api/command-return-object/TestSBCommandReturnObject.py ./lldb/test/API/python_api/commandreturnobject/TestSBCommandReturnObject.py --- .../TestSBCommandReturnObject.py | 9 +++++++++ .../TestSBCommandReturnObject.py | 17 ----------------- 2 files changed, 9 insertions(+), 17 deletions(-) delete mode 100644 lldb/test/API/python_api/commandreturnobject/TestSBCommandReturnObject.py diff --git a/lldb/test/API/api/command-return-object/TestSBCommandReturnObject.py b/lldb/test/API/api/command-return-object/TestSBCommandReturnObject.py index f2dbbbd7b4d42..2193b7270d0b4 100644 --- a/lldb/test/API/api/command-return-object/TestSBCommandReturnObject.py +++ b/lldb/test/API/api/command-return-object/TestSBCommandReturnObject.py @@ -24,3 +24,12 @@ def test_sb_command_return_object(self): # return exit code 0 to indicate success. We can let this exception go # - the test harness will recognize it as a test failure. 
subprocess.check_call([self.driver_exe, self.driver_exe]) + + def test_get_command(self): + res = lldb.SBCommandReturnObject() + self.assertEqual(res.GetCommand(), "") + + ci = self.dbg.GetCommandInterpreter() + ci.HandleCommand("help help", res) + self.assertTrue(res.Succeeded()) + self.assertEqual(res.GetCommand(), "help help") diff --git a/lldb/test/API/python_api/commandreturnobject/TestSBCommandReturnObject.py b/lldb/test/API/python_api/commandreturnobject/TestSBCommandReturnObject.py deleted file mode 100644 index b0d0b7a8dfe4e..0000000000000 --- a/lldb/test/API/python_api/commandreturnobject/TestSBCommandReturnObject.py +++ /dev/null @@ -1,17 +0,0 @@ -import lldb -from lldbsuite.test.decorators import * -from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil - - -class SBCommandReturnObjectTest(TestBase): - NO_DEBUG_INFO_TESTCASE = True - - def test(self): - res = lldb.SBCommandReturnObject() - self.assertEqual(res.GetCommand(), "") - - ci = self.dbg.GetCommandInterpreter() - ci.HandleCommand("help help", res) - self.assertTrue(res.Succeeded()) - self.assertEqual(res.GetCommand(), "help help") From e9a20f77ee2117b4a6eb40826b7280e29ad29e1e Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Sun, 9 Feb 2025 19:21:54 +0000 Subject: [PATCH 103/293] Reland "[LV]: Teach LV to recursively (de)interleave." (#125094) This patch relands the changes from "[LV]: Teach LV to recursively (de)interleave." (#122989) Reason for revert: - The patch exposed an assert in the vectorizer related to a VF difference between the legacy cost model and the VPlan-based cost model, caused by the uncalculated cost of a VPInstruction that VPlanTransforms creates as a replacement for an 'or disjoint' instruction. VPlanTransforms makes that instruction change when there is memory interleaving together with predicated blocks, but that change did not cause problems because in most cases the cost difference between the legacy and new models is not noticeable.
- Issue is fixed by #125434 Original patch: https://github.com/llvm/llvm-project/pull/89018 Reviewed-by: paulwalker-arm, Mel-Chen --- .../Transforms/Vectorize/LoopVectorize.cpp | 14 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 79 +- .../AArch64/sve-interleaved-accesses.ll | 260 +++- .../sve-interleaved-masked-accesses.ll | 252 ++++ .../RISCV/interleaved-accesses.ll | 1318 +++++++++-------- .../AArch64/sve-interleave-vectorization.ll | 135 ++ 6 files changed, 1387 insertions(+), 671 deletions(-) create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 4c0bed6fedf5b..c2d347cf9b7e0 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3439,10 +3439,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( if (hasIrregularType(ScalarTy, DL)) return false; - // We currently only know how to emit interleave/deinterleave with - // Factor=2 for scalable vectors. This is purely an implementation - // limit. - if (VF.isScalable() && InterleaveFactor != 2) + // For scalable vectors, the only interleave factor currently supported + // must be power of 2 since we require the (de)interleave2 intrinsics + // instead of shufflevectors. + if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor)) return false; // If the group involves a non-integral pointer, we may not be able to @@ -9311,9 +9311,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); // For scalable vectors, the only interleave factor currently supported - // is 2 since we require the (de)interleave2 intrinsics instead of - // shufflevectors. 
- assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && + // must be power of 2 since we require the (de)interleave2 intrinsics + // instead of shufflevectors. + assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) && "Unsupported interleave factor for scalable vectors"); return Result; }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 08d7338da87bc..618c8aef38408 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2915,10 +2915,21 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef Vals, // Scalable vectors cannot use arbitrary shufflevectors (only splats), so // must use intrinsics to interleave. if (VecTy->isScalableTy()) { - VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); - return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, - Vals, - /*FMFSource=*/nullptr, Name); + assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for " + "scalable vectors, must be power of 2"); + SmallVector InterleavingValues(Vals); + // When interleaving, the number of values will be shrunk until we have the + // single final interleaved value. + auto *InterleaveTy = cast(InterleavingValues[0]->getType()); + for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) { + InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy); + for (unsigned I = 0; I < Midpoint; ++I) + InterleavingValues[I] = Builder.CreateIntrinsic( + InterleaveTy, Intrinsic::vector_interleave2, + {InterleavingValues[I], InterleavingValues[Midpoint + I]}, + /*FMFSource=*/nullptr, Name); + } + return InterleavingValues[0]; } // Fixed length. Start by concatenating all vectors into a wide vector. 
@@ -3004,15 +3015,11 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { &InterleaveFactor](Value *MaskForGaps) -> Value * { if (State.VF.isScalable()) { assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); - assert(InterleaveFactor == 2 && + assert(isPowerOf2_32(InterleaveFactor) && "Unsupported deinterleave factor for scalable vectors"); auto *ResBlockInMask = State.get(BlockInMask); - SmallVector Ops = {ResBlockInMask, ResBlockInMask}; - auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), - State.VF.getKnownMinValue() * 2, true); - return State.Builder.CreateIntrinsic( - MaskTy, Intrinsic::vector_interleave2, Ops, - /*FMFSource=*/nullptr, "interleaved.mask"); + SmallVector Ops(InterleaveFactor, ResBlockInMask); + return interleaveVectors(State.Builder, Ops, "interleaved.mask"); } if (!BlockInMask) @@ -3052,22 +3059,48 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { ArrayRef VPDefs = definedValues(); const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); if (VecTy->isScalableTy()) { - assert(InterleaveFactor == 2 && + assert(isPowerOf2_32(InterleaveFactor) && "Unsupported deinterleave factor for scalable vectors"); - // Scalable vectors cannot use arbitrary shufflevectors (only splats), - // so must use intrinsics to deinterleave. - Value *DI = State.Builder.CreateIntrinsic( - Intrinsic::vector_deinterleave2, VecTy, NewLoad, - /*FMFSource=*/nullptr, "strided.vec"); - unsigned J = 0; - for (unsigned I = 0; I < InterleaveFactor; ++I) { - Instruction *Member = Group->getMember(I); + // Scalable vectors cannot use arbitrary shufflevectors (only splats), + // so must use intrinsics to deinterleave. 
+ SmallVector DeinterleavedValues(InterleaveFactor); + DeinterleavedValues[0] = NewLoad; + // For the case of InterleaveFactor > 2, we will have to do recursive + // deinterleaving, because the current available deinterleave intrinsic + // supports only Factor of 2, otherwise it will bailout after first + // iteration. + // When deinterleaving, the number of values will double until we + // have "InterleaveFactor". + for (unsigned NumVectors = 1; NumVectors < InterleaveFactor; + NumVectors *= 2) { + // Deinterleave the elements within the vector + SmallVector TempDeinterleavedValues(NumVectors); + for (unsigned I = 0; I < NumVectors; ++I) { + auto *DiTy = DeinterleavedValues[I]->getType(); + TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic( + Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I], + /*FMFSource=*/nullptr, "strided.vec"); + } + // Extract the deinterleaved values: + for (unsigned I = 0; I < 2; ++I) + for (unsigned J = 0; J < NumVectors; ++J) + DeinterleavedValues[NumVectors * I + J] = + State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I); + } - if (!Member) +#ifndef NDEBUG + for (Value *Val : DeinterleavedValues) + assert(Val && "NULL Deinterleaved Value"); +#endif + for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + Value *StridedVec = DeinterleavedValues[I]; + if (!Member) { + // This value is not needed as it's not used + cast(StridedVec)->eraseFromParent(); continue; - - Value *StridedVec = State.Builder.CreateExtractValue(DI, I); + } // If this member has different type, cast the result type. 
if (Member->getType() != ScalarTy) { VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index d91b467a44865..eb805b1f8c571 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -396,8 +396,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP10]]) ; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP10]]) ; CHECK-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] ; CHECK-NEXT: [[TMP13:%.*]] = sub nsw [[REVERSE1]], [[VEC_IND]] @@ -1548,5 +1548,263 @@ end: ret void } +; Check vectorization on an interleaved load/store groups of factor 4 + +; for (int i = 0; i < 1024; ++i) { +; dst[i].x = a[i].x + b[i].x; +; dst[i].y = a[i].y - b[i].y; +; dst[i].z = a[i].z << b[i].z; +; dst[i].t = a[i].t >> b[i].t; +; } +%struct.xyzt = type { i32, i32, i32, i32 } + +define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a, ptr readonly %b) { +; CHECK-LABEL: @interleave_deinterleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: 
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP7]]) +; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC7]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC7]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load , ptr [[TMP13]], align 4 +; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC8]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC9]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC9]], 1 +; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP14]]) +; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP15]]) +; CHECK-NEXT: 
[[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC10]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC11]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC10]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC11]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP12]], [[TMP19]] +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP20]], [[TMP23]]) +; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP22]], [[TMP24]]) +; CHECK-NEXT: [[INTERLEAVED_VEC13:%.*]] = call @llvm.vector.interleave2.nxv16i32( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[B]], i64 
[[INDVARS_IV]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[Z]], align 4 +; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP32]], [[TMP33]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], 
label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv + store i32 %add, ptr %arrayidx5, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4 + %2 = load i32, ptr %y, align 4 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4 + %3 = load i32, ptr %y11, align 4 + %sub = sub nsw i32 %2, %3 + %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4 + store i32 %sub, ptr %y14, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8 + %4 = load i32, ptr %z, align 4 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8 + %5 = load i32, ptr %z19, align 4 + %shl = shl i32 %4, %5 + %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8 + store i32 %shl, ptr %z22, align 4 + %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12 + %6 = load i32, ptr %t, align 4 + %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12 + %7 = load i32, ptr %t27, align 4 + %shr = ashr i32 %6, %7 + %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12 + store i32 %shr, ptr %t30, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +; Check vectorization on a reverse interleaved load/store groups of factor 4 + +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; +; int b = A[i].y - i; +; int c = A[i].z * i; +; int d = A[i].t << i; +; B[i].x = a; +; B[i].y = b; +; B[i].z = c; +; B[i].t = d; +; } + 
+define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) #1{ +; CHECK-LABEL: @interleave_deinterleave_reverse( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[INDUCTION:%.*]] = sub splat (i32 1023), [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP11]]) +; CHECK-NEXT: 
[[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP13]]) +; CHECK-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP14]]) +; CHECK-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP15]]) +; CHECK-NEXT: [[REVERSE5:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP16]]) +; CHECK-NEXT: [[TMP17:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] +; CHECK-NEXT: [[TMP18:%.*]] = sub nsw [[REVERSE3]], [[VEC_IND]] +; CHECK-NEXT: [[TMP19:%.*]] = mul nsw [[REVERSE4]], [[VEC_IND]] +; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw [[REVERSE5]], [[VEC_IND]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4 +; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]] +; CHECK-NEXT: [[REVERSE6:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP17]]) +; CHECK-NEXT: [[REVERSE7:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP18]]) +; CHECK-NEXT: [[REVERSE8:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP19]]) +; CHECK-NEXT: [[REVERSE9:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP20]]) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE6]], [[REVERSE8]]) +; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE7]], [[REVERSE9]]) +; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv16i32( 
[[INTERLEAVED_VEC]], [[INTERLEAVED_VEC10]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC11]], ptr [[TMP26]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP44:![0-9]+]] +; +entry: + br label %for.body +for.cond.cleanup: ; preds = %for.body + ret void +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 0 + %load1 = load i32, ptr %x, align 4 + %trunc = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %load1, %trunc + %y = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 1 + %load2 = load i32, ptr %y, align 4 + %sub = sub nsw i32 %load2, %trunc + %z = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 2 + %load3 = load i32, ptr %z, align 4 + %mul = mul nsw i32 %load3, %trunc + %t = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 3 + %load4 = load i32, ptr %t, align 4 + %shl = shl nuw nsw i32 %load4, %trunc + %x5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 0 + store i32 %add, ptr %x5, align 4 + %y8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 1 + store i32 %sub, ptr %y8, align 4 + %z5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 2 + store i32 %mul, ptr %z5, align 4 + %t8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 3 
+ store i32 %shl, ptr %t8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +} attributes #1 = { "target-features"="+sve" vscale_range(1, 16) } attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 34141bd8f8197..a2da67c2cbb81 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -529,3 +529,255 @@ for.inc: for.end: ret void } + +; Expected to contain interleave2/deinterleave2 instructions +; +; void masked_strided_factor4(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char left1 = p[4*ix]; +; char right1 = p[4*ix + 1]; +; char left2 = p[4*ix + 2]; +; char right2 = p[4*ix + 3]; +; char max1 = max(left1, right1); +; char max2 = max(left2, right2); +; q[4*ix] = max1; +; q[4*ix + 1] = 0 - max1; +; q[4*ix + 2] = max2; +; q[4*ix + 3] = 0 - max2; +; } +; } +;} +define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 { +; SCALAR_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4 +; SCALAR_TAIL_FOLDING-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SCALAR_TAIL_FOLDING-NEXT: entry: +; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = 
icmp ugt i32 [[TMP1]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALAR_TAIL_FOLDING: vector.ph: +; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] +; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALAR_TAIL_FOLDING: vector.body: +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: 
[[INTERLEAVED_MASK2:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]], [[INTERLEAVED_MASK1]]) +; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK2]], poison) +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]]) +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP17]], [[TMP19]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP18]], [[TMP20]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC5]]) +; SCALAR_TAIL_FOLDING-NEXT: 
[[INTERLEAVED_MASK7:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]], [[INTERLEAVED_MASK8]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK9]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALAR_TAIL_FOLDING: middle.block: +; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALAR_TAIL_FOLDING: scalar.ph: +; SCALAR_TAIL_FOLDING-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] +; SCALAR_TAIL_FOLDING: for.body: +; SCALAR_TAIL_FOLDING-NEXT: [[IX_024:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_024]], [[CONV]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; SCALAR_TAIL_FOLDING: if.then: +; SCALAR_TAIL_FOLDING-NEXT: [[IDX0:%.*]] = shl nuw nsw i32 [[IX_024]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX1:%.*]] = or disjoint i32 [[IDX0]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX2:%.*]] = or disjoint i32 [[IDX0]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX3:%.*]] = or disjoint i32 [[IDX0]], 3 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP24:%.*]] = zext nneg i32 [[IDX0]] to i64 +; 
SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP24]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAY1IDX0]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP26:%.*]] = zext nneg i32 [[IDX1]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP26]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAY1IDX1]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP28:%.*]] = zext nneg i32 [[IDX2]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP28]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAY1IDX2]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP30:%.*]] = zext nneg i32 [[IDX3]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP30]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAY1IDX3]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I1:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP25]], i8 [[TMP27]]) +; SCALAR_TAIL_FOLDING-NEXT: [[SUB1:%.*]] = sub i8 0, [[SPEC_SELECT_I1]] +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I2:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP29]], i8 [[TMP31]]) +; SCALAR_TAIL_FOLDING-NEXT: [[SUB2:%.*]] = sub i8 0, [[SPEC_SELECT_I2]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP32:%.*]] = zext nneg i32 [[IDX0]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP32]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I1]], ptr [[ARRAY3IDX0]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP33:%.*]] = zext nneg i32 [[IDX1]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP33]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB1]], ptr [[ARRAY3IDX1]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP34:%.*]] = zext nneg i32 [[IDX2]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX2:%.*]] = getelementptr inbounds 
nuw i8, ptr [[Q]], i64 [[TMP34]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I2]], ptr [[ARRAY3IDX2]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP35:%.*]] = zext nneg i32 [[IDX3]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP35]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB2]], ptr [[ARRAY3IDX3]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] +; SCALAR_TAIL_FOLDING: for.inc: +; SCALAR_TAIL_FOLDING-NEXT: [[INC]] = add nuw nsw i32 [[IX_024]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALAR_TAIL_FOLDING: for.end: +; SCALAR_TAIL_FOLDING-NEXT: ret void +; +; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4 +; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { +; PREDICATED_TAIL_FOLDING-NEXT: entry: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; PREDICATED_TAIL_FOLDING: vector.ph: +; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 +; 
PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDICATED_TAIL_FOLDING: vector.body: +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]], [[INTERLEAVED_MASK1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK2]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } 
[[STRIDED_VEC]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP17]], [[TMP19]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP18]], [[TMP20]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC5]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]], [[INTERLEAVED_MASK8]]) +; 
PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK9]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; PREDICATED_TAIL_FOLDING: middle.block: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; PREDICATED_TAIL_FOLDING: scalar.ph: +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] +; PREDICATED_TAIL_FOLDING: for.body: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; PREDICATED_TAIL_FOLDING: if.then: +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_INC]] +; PREDICATED_TAIL_FOLDING: for.inc: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; PREDICATED_TAIL_FOLDING: for.end: +; PREDICATED_TAIL_FOLDING-NEXT: ret void +; +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.024, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %idx0 = shl nuw nsw i32 %ix.024, 2 + %idx1 = add i32 %idx0, 1 + %idx2 = add i32 %idx0, 2 + %idx3 = add i32 %idx0, 3 + + %array1idx0 = getelementptr inbounds i8, ptr %p, i32 %idx0 + %0 = load i8, ptr %array1idx0, align 1 + %array1idx1 = getelementptr inbounds i8, ptr %p, i32 %idx1 + %1 = load i8, ptr %array1idx1, align 1 + %array1idx2 = getelementptr inbounds i8, ptr %p, i32 %idx2 + %2 = load i8, ptr %array1idx2, align 1 + 
%array1idx3 = getelementptr inbounds i8, ptr %p, i32 %idx3 + %3 = load i8, ptr %array1idx3, align 1 + + %cmp.i1 = icmp slt i8 %0, %1 + %spec.select.i1 = select i1 %cmp.i1, i8 %1, i8 %0 + %sub1 = sub i8 0, %spec.select.i1 + %cmp.i2 = icmp slt i8 %2, %3 + %spec.select.i2 = select i1 %cmp.i2, i8 %3, i8 %2 + %sub2 = sub i8 0, %spec.select.i2 + + %array3idx0 = getelementptr inbounds i8, ptr %q, i32 %idx0 + store i8 %spec.select.i1, ptr %array3idx0, align 1 + %array3idx1 = getelementptr inbounds i8, ptr %q, i32 %idx1 + store i8 %sub1, ptr %array3idx1, align 1 + %array3idx2 = getelementptr inbounds i8, ptr %q, i32 %idx2 + store i8 %spec.select.i2, ptr %array3idx2, align 1 + %array3idx3 = getelementptr inbounds i8, ptr %q, i32 %idx3 + store i8 %sub2, ptr %array3idx3, align 1 + + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.024, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index bda4839dead51..b1ff589fe51bf 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -9,7 +9,7 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -17,88 +17,88 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 
@llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 1) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i32 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP12]], [[TMP15]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 1) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 2) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP11]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], 
ptr [[Q0]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; CHECK-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 
[[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor2_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP2]], align 4 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> -; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 
[[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; FIXED-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> +; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, 
ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; FIXED-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -107,7 +107,7 @@ define void @load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -115,44 +115,44 @@ define void @load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: 
[[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 1) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i32 2) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP12]], [[TMP15]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 1) +; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 2) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP11]], [[TMP12]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; 
SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; SCALABLE-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -186,7 +186,7 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, 
[[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -194,88 +194,88 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 1) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i64 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP12]], [[TMP15]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], 
label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 1) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 2) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP11]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = 
add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor2_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: 
[[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> -; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> +; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: 
[[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; FIXED-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -284,7 +284,7 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -292,44 +292,44 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: 
vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 1) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i64 2) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP12]], [[TMP15]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 1) +; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 2) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP11]], [[TMP12]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; 
SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; SCALABLE-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; SCALABLE-NEXT: 
[[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -360,42 +360,42 @@ exit: define void @load_store_factor3_i32(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = 
shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: 
[[OFFSET0:%.*]] = mul i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; CHECK-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -405,50 +405,50 @@ define void @load_store_factor3_i32(ptr %p) { ; CHECK-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; CHECK-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; CHECK-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor3_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: 
[[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; FIXED-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <8 
x i32> [[STRIDED_VEC2]], splat (i32 3) +; FIXED-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; FIXED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; FIXED-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 
[[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -458,50 +458,50 @@ define void @load_store_factor3_i32(ptr %p) { ; FIXED-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; FIXED-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; FIXED-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor3_i32( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector 
<24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; SCALABLE-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; SCALABLE-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; SCALABLE-NEXT: br i1 [[TMP9]], label 
[[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -511,9 +511,9 @@ define void @load_store_factor3_i32(ptr %p) { ; SCALABLE-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; SCALABLE-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] 
= add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -550,42 +550,42 @@ exit: define void @load_store_factor3_i64(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i64( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x 
i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; 
CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; CHECK-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -595,50 +595,50 @@ define void @load_store_factor3_i64(ptr %p) { ; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor3_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; 
FIXED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; 
FIXED-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; FIXED-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; FIXED-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load 
i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -648,50 +648,50 @@ define void @load_store_factor3_i64(ptr %p) { ; FIXED-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; FIXED-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; FIXED-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor3_i64( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, 
<4 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; SCALABLE-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; SCALABLE-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop 
[[LOOP8:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -701,9 +701,9 @@ define void @load_store_factor3_i64(ptr %p) { ; SCALABLE-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; SCALABLE-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; SCALABLE-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: 
[[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -740,56 +740,75 @@ exit: define void @load_store_factor8(ptr %p) { ; CHECK-LABEL: @load_store_factor8( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector 
<16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; CHECK-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[I]], 0 +; 
CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]]) +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]]) +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]]) +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]]) +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 1) +; CHECK-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 2) +; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP14]], 
splat (i64 3) +; CHECK-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 4) +; CHECK-NEXT: [[TMP24:%.*]] = add [[TMP16]], splat (i64 5) +; CHECK-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 6) +; CHECK-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 7) +; CHECK-NEXT: [[TMP27:%.*]] = add [[TMP19]], splat (i64 8) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) +; CHECK-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) +; CHECK-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) +; CHECK-NEXT: [[INTERLEAVED_VEC9:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP23]], [[TMP27]]) +; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) +; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) +; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ 
[[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; CHECK-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -824,23 +843,23 @@ define void @load_store_factor8(ptr %p) { ; CHECK-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; CHECK-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; CHECK-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor8( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
[[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> @@ -849,39 +868,39 @@ define void @load_store_factor8(ptr %p) { ; FIXED-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; FIXED-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; FIXED-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; FIXED-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; FIXED-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; FIXED-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> 
-; FIXED-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; FIXED-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; FIXED-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; FIXED-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; FIXED-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; FIXED-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; FIXED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) +; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) +; FIXED-NEXT: [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) +; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) +; FIXED-NEXT: [[TMP9:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) +; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) +; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> +; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> [[TMP8]], <4 x i32> +; FIXED-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <4 x i32> +; 
FIXED-NEXT: [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP12]], <8 x i32> +; FIXED-NEXT: [[TMP16:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> [[TMP14]], <8 x i32> +; FIXED-NEXT: [[TMP17:%.*]] = shufflevector <8 x i64> [[TMP15]], <8 x i64> [[TMP16]], <16 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP17]], <16 x i64> poison, <16 x i32> +; FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 2 +; FIXED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; FIXED-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: 
[[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -916,64 +935,83 @@ define void @load_store_factor8(ptr %p) { ; FIXED-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; FIXED-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; FIXED-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor8( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] 
= shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; SCALABLE-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; SCALABLE-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; SCALABLE-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; SCALABLE-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; SCALABLE-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; SCALABLE-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; SCALABLE-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; SCALABLE-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; SCALABLE-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; SCALABLE-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; SCALABLE-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; SCALABLE-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; SCALABLE-NEXT: [[TMP27:%.*]] = 
shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; SCALABLE-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) +; SCALABLE-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]]) +; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]]) +; SCALABLE-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]]) +; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]]) +; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]]) +; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]]) +; SCALABLE-NEXT: [[TMP12:%.*]] = 
extractvalue { , } [[STRIDED_VEC3]], 0 +; SCALABLE-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; SCALABLE-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 0 +; SCALABLE-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; SCALABLE-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; SCALABLE-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; SCALABLE-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 +; SCALABLE-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; SCALABLE-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 1) +; SCALABLE-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 2) +; SCALABLE-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 3) +; SCALABLE-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 4) +; SCALABLE-NEXT: [[TMP24:%.*]] = add [[TMP16]], splat (i64 5) +; SCALABLE-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 6) +; SCALABLE-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 7) +; SCALABLE-NEXT: [[TMP27:%.*]] = add [[TMP19]], splat (i64 8) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC9:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP23]], [[TMP27]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]] +; 
SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -1008,9 +1046,9 @@ define void @load_store_factor8(ptr %p) { ; SCALABLE-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; SCALABLE-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; SCALABLE-NEXT: store i64 [[Y7]], ptr 
[[Q7]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -1080,7 +1118,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -1088,94 +1126,94 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 
[[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store [[TMP12]], ptr [[TMP14]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] 
] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; CHECK-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @combine_load_factor2_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: 
[[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[I]], 8 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 ; FIXED-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]] +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4 +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4 -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; FIXED-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]] -; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] -; FIXED-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 -; FIXED-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 8 -; FIXED-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4 -; FIXED-NEXT: store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: 
[[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4 +; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC4]] +; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] +; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 +; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 8 +; FIXED-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP9]], align 4 +; FIXED-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP10]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 16 +; FIXED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], 
align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; FIXED-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; FIXED-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; FIXED-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -1184,7 +1222,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -1192,43 +1230,43 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 
[[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 -; SCALABLE-NEXT: store [[TMP12]], ptr [[TMP14]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[TMP11]], ptr [[TMP13]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; SCALABLE: middle.block: ; 
SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; SCALABLE-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; SCALABLE: exit: ; 
SCALABLE-NEXT: ret void ; @@ -1263,7 +1301,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -1271,94 +1309,94 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] -; 
CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store [[TMP12]], ptr [[TMP14]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0 +; CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr 
[[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; CHECK-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @combine_load_factor2_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[I]], 4 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 ; FIXED-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] ; FIXED-NEXT: [[TMP5:%.*]] = 
getelementptr i64, ptr [[P]], i64 [[TMP3]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]] -; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] -; FIXED-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0 -; FIXED-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4 -; FIXED-NEXT: store <4 x i64> [[TMP8]], ptr [[TMP12]], align 8 -; FIXED-NEXT: store <4 x i64> [[TMP9]], ptr [[TMP13]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; FIXED-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC3]], [[STRIDED_VEC4]] +; FIXED-NEXT: [[TMP8:%.*]] = 
getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] +; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP8]], i32 4 +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP9]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP10]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; FIXED-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; FIXED-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; FIXED-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; FIXED-NEXT: [[NEXTI]] 
= add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -1367,7 +1405,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -1375,43 +1413,43 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], 
i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0 -; SCALABLE-NEXT: store [[TMP12]], ptr [[TMP14]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[TMP11]], ptr [[TMP13]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; 
SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; SCALABLE-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll new file mode 100644 index 0000000000000..362ec22600f92 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll @@ -0,0 +1,135 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt 
-passes=loop-vectorize,interleaved-access -mattr=+sve -S -o - %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +%struct.xyzt = type { i32, i32, i32, i32 } +; for (int i = 0; i < 1024; ++i) { +; dst[i].x = a[i].x + b[i].x; +; dst[i].y = a[i].y - b[i].y; +; dst[i].z = a[i].z << b[i].z; +; dst[i].t = a[i].t >> b[i].t; +; } + +define void @interleave_deinterleave(ptr noalias %dst, ptr %a, ptr %b) { +; CHECK-LABEL: @interleave_deinterleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( splat (i1 true), ptr [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[LDN]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[LDN]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[LDN]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[LDN]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[LDN9:%.*]] = call { , , , } 
@llvm.aarch64.sve.ld4.sret.nxv4i32( splat (i1 true), ptr [[TMP13]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , , , } [[LDN9]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[LDN9]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , , , } [[LDN9]], 2 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , , , } [[LDN9]], 3 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP12]], [[TMP19]] +; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP20]], [[TMP22]], [[TMP23]], [[TMP24]], splat (i1 true), ptr [[TMP21]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds 
[[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP33]], [[TMP26]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[Z]], align 4 +; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP27]], [[TMP28]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.a = 
getelementptr inbounds %struct.xyzt, ptr %a, i64 %iv + %a.0 = load i32, ptr %gep.a, align 4 + %gep.b = getelementptr inbounds %struct.xyzt, ptr %b, i64 %iv + %b.0 = load i32, ptr %gep.b, align 4 + %add = add nsw i32 %b.0, %a.0 + %gep.dst = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %iv + store i32 %add, ptr %gep.dst, align 4 + %gep.a.1 = getelementptr inbounds nuw i8, ptr %gep.a, i64 4 + %a.1 = load i32, ptr %gep.a.1, align 4 + %gep.b.1 = getelementptr inbounds nuw i8, ptr %gep.b, i64 4 + %b.1 = load i32, ptr %gep.b.1, align 4 + %sub = sub nsw i32 %a.1, %b.1 + %gep.dst.1 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 4 + store i32 %sub, ptr %gep.dst.1, align 4 + %gep.a.2 = getelementptr inbounds nuw i8, ptr %gep.a, i64 8 + %a.2 = load i32, ptr %gep.a.2, align 4 + %gep.b.2 = getelementptr inbounds nuw i8, ptr %gep.b, i64 8 + %b.2 = load i32, ptr %gep.b.2, align 4 + %shl = shl i32 %a.2, %b.2 + %gep.dst.2 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 8 + store i32 %shl, ptr %gep.dst.2, align 4 + %gep.a.3 = getelementptr inbounds nuw i8, ptr %gep.a, i64 12 + %a.3 = load i32, ptr %gep.a.3, align 4 + %gep.b.3 = getelementptr inbounds nuw i8, ptr %gep.b, i64 12 + %b.3 = load i32, ptr %gep.b.3, align 4 + %shr = ashr i32 %a.3, %b.3 + %gep.dst.3 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 12 + store i32 %shr, ptr %gep.dst.3, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} From d2047242e6d0f0deb7634ff22ab164354c520c79 Mon Sep 17 00:00:00 2001 From: Alexander Richardson Date: Sun, 9 Feb 2025 12:18:52 -0800 Subject: [PATCH 104/293] [CSKY] Default to unsigned char This matches the ABI document found at https://github.com/c-sky/csky-doc/blob/master/C-SKY_V2_CPU_Applications_Binary_Interface_Standards_Manual.pdf Partially addresses https://github.com/llvm/llvm-project/issues/115957 Reviewed By: zixuan-wu Pull Request: 
https://github.com/llvm/llvm-project/pull/115961 --- clang/lib/Driver/ToolChains/Clang.cpp | 1 + clang/test/Driver/csky-toolchain.c | 1 + 2 files changed, 2 insertions(+) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 821407687ffa1..fe879e8f8bd27 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1387,6 +1387,7 @@ static bool isSignedCharDefault(const llvm::Triple &Triple) { return true; return false; + case llvm::Triple::csky: case llvm::Triple::hexagon: case llvm::Triple::msp430: case llvm::Triple::ppcle: diff --git a/clang/test/Driver/csky-toolchain.c b/clang/test/Driver/csky-toolchain.c index 66485464652ac..638ce64ec98cd 100644 --- a/clang/test/Driver/csky-toolchain.c +++ b/clang/test/Driver/csky-toolchain.c @@ -3,6 +3,7 @@ // RUN: %clang -### %s --target=csky 2>&1 | FileCheck -check-prefix=CC1 %s // CC1: "-cc1" "-triple" "csky" +// CC1: "-fno-signed-char" // In the below tests, --rtlib=platform is used so that the driver ignores // the configure-time CLANG_DEFAULT_RTLIB option when choosing the runtime lib From 3ce96b9ee961e0dc1f27fbb96339c6253f0196bc Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 9 Feb 2025 13:32:37 -0800 Subject: [PATCH 105/293] [Analysis] Avoid repeated hash lookups (NFC) (#126402) --- llvm/lib/Analysis/DependenceGraphBuilder.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/Analysis/DependenceGraphBuilder.cpp b/llvm/lib/Analysis/DependenceGraphBuilder.cpp index 7ee2adf49ebb4..c076e52ce6e14 100644 --- a/llvm/lib/Analysis/DependenceGraphBuilder.cpp +++ b/llvm/lib/Analysis/DependenceGraphBuilder.cpp @@ -240,9 +240,7 @@ template void AbstractDependenceGraphBuilder::createDefUseEdges() { Instruction *UI = dyn_cast(U); if (!UI) continue; - NodeType *DstNode = nullptr; - if (IMap.find(UI) != IMap.end()) - DstNode = IMap.find(UI)->second; + NodeType *DstNode = IMap.lookup(UI); // In the case of loops, the scope 
of the subgraph is all the // basic blocks (and instructions within them) belonging to the loop. We From f6f052625e77632bb672c5ea40d414f0f33fd5b1 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 9 Feb 2025 13:33:03 -0800 Subject: [PATCH 106/293] [Sema] Avoid repeated hash lookups (NFC) (#126428) --- clang/lib/Sema/JumpDiagnostics.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/lib/Sema/JumpDiagnostics.cpp b/clang/lib/Sema/JumpDiagnostics.cpp index 4b92d67e49d7d..ffbb9bc0bfe7c 100644 --- a/clang/lib/Sema/JumpDiagnostics.cpp +++ b/clang/lib/Sema/JumpDiagnostics.cpp @@ -786,8 +786,7 @@ void JumpScopeChecker::VerifyIndirectJumps() { if (CHECK_PERMISSIVE(!LabelAndGotoScopes.count(TheLabel->getStmt()))) continue; unsigned LabelScope = LabelAndGotoScopes[TheLabel->getStmt()]; - if (!TargetScopes.contains(LabelScope)) - TargetScopes[LabelScope] = TheLabel; + TargetScopes.try_emplace(LabelScope, TheLabel); } // For each target scope, make sure it's trivially reachable from From b48b422c08e85e6afd39aea7341fdf08d07d3e08 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 9 Feb 2025 13:33:46 -0800 Subject: [PATCH 107/293] [Serialization] Avoid repeated hash lookups (NFC) (#126429) --- clang/lib/Serialization/ASTReader.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index b74bd586e74d7..3c64b67503195 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -645,10 +645,11 @@ collectMacroDefinitions(const PreprocessorOptions &PPOpts, // For an #undef'd macro, we only care about the name. 
if (IsUndef) { - if (MacroNames && !Macros.count(MacroName)) + auto [It, Inserted] = Macros.try_emplace(MacroName); + if (MacroNames && Inserted) MacroNames->push_back(MacroName); - Macros[MacroName] = std::make_pair("", true); + It->second = std::make_pair("", true); continue; } @@ -661,9 +662,10 @@ collectMacroDefinitions(const PreprocessorOptions &PPOpts, MacroBody = MacroBody.substr(0, End); } - if (MacroNames && !Macros.count(MacroName)) + auto [It, Inserted] = Macros.try_emplace(MacroName); + if (MacroNames && Inserted) MacroNames->push_back(MacroName); - Macros[MacroName] = std::make_pair(MacroBody, false); + It->second = std::make_pair(MacroBody, false); } } From aa066e36f8c421a64e098601b226f0ecd85500c5 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 9 Feb 2025 13:34:28 -0800 Subject: [PATCH 108/293] [AMDGPU] Avoid repeated hash lookups (NFC) (#126430) --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 28a27ffc13677..69ddb384e1a40 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -839,10 +839,9 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { return RejectUser(Inst, "mem transfer inst length is non-constant or " "not a multiple of the vector element size"); - if (!TransferInfo.count(TransferInst)) { + if (TransferInfo.try_emplace(TransferInst).second) { DeferredInsts.push_back(Inst); WorkList.push_back(Inst); - TransferInfo[TransferInst] = MemTransferInfo(); } auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * { From d1af9ca9fdb0db6ecea00e58b713e43fc1b9fa1c Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 9 Feb 2025 13:34:47 -0800 Subject: [PATCH 109/293] [AsmPrinter] Avoid repeated map lookups (NFC) (#126431) --- llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp 
| 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp index 55a0afcf7a33f..c9efec37b0bc6 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp @@ -414,9 +414,10 @@ static void handleNewDebugValue(InlinedEntity Var, const MachineInstr &DV, dropRegDescribedVar(RegVars, I.first, Var); // Drop all entries that have ended, and mark the new entry as live. + auto &Entries = LiveEntries[Var]; for (auto Index : IndicesToErase) - LiveEntries[Var].erase(Index); - LiveEntries[Var].insert(NewIndex); + Entries.erase(Index); + Entries.insert(NewIndex); } } From df25511f0e13e8292de485c2c4d7b58941c77afb Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 9 Feb 2025 13:35:12 -0800 Subject: [PATCH 110/293] [Coroutines] Avoid repeated hash lookups (NFC) (#126432) --- llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index c56d1bf33efd8..4104e4e533e9d 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -752,13 +752,14 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape, dwarf::DW_ATE_unsigned_char)}); for (auto *V : FrameData.getAllDefs()) { - if (!DIVarCache.contains(V)) + auto It = DIVarCache.find(V); + if (It == DIVarCache.end()) continue; auto Index = FrameData.getFieldIndex(V); - NameCache.insert({Index, DIVarCache[V]->getName()}); - TyCache.insert({Index, DIVarCache[V]->getType()}); + NameCache.insert({Index, It->second->getName()}); + TyCache.insert({Index, It->second->getType()}); } // Cache from index to (Align, Offset Pair) From af6c6992cfda195e84cbe8a0710fd3bc02082104 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 9 Feb 
2025 13:35:58 -0800 Subject: [PATCH 111/293] [TableGen] Avoid repeated hash lookups (NFC) (#126433) --- llvm/utils/TableGen/Common/CodeGenSchedule.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp index e84b4fd77a6c1..8e8b3196c91b0 100644 --- a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp @@ -413,9 +413,9 @@ void CodeGenSchedModels::collectSTIPredicates() { for (const Record *R : Records.getAllDerivedDefinitions("STIPredicate")) { const Record *Decl = R->getValueAsDef("Declaration"); - const auto It = Decl2Index.find(Decl); - if (It == Decl2Index.end()) { - Decl2Index[Decl] = STIPredicates.size(); + const auto [It, Inserted] = + Decl2Index.try_emplace(Decl, STIPredicates.size()); + if (Inserted) { STIPredicateFunction Predicate(Decl); Predicate.addDefinition(R); STIPredicates.emplace_back(std::move(Predicate)); From 04e5ea5237da5c49d05cd9499a5f0eb325638cf9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 8 Feb 2025 22:50:13 -0800 Subject: [PATCH 112/293] [TableGen] Remove recursive walk of linked list from ContractNodes. NFC After f9250401ef120a4605ad67bb43d3b25500900498, this function is tail recursive so it was straightforward to convert this to iteratively walk the linkd list. --- llvm/utils/TableGen/DAGISelMatcherOpt.cpp | 448 +++++++++++----------- 1 file changed, 227 insertions(+), 221 deletions(-) diff --git a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp index ed062168dbc6e..9028a60027ffe 100644 --- a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp @@ -22,275 +22,275 @@ using namespace llvm; /// ContractNodes - Turn multiple matcher node patterns like 'MoveChild+Record' /// into single compound nodes like RecordChild. 
-static void ContractNodes(std::unique_ptr &MatcherPtr, +static void ContractNodes(std::unique_ptr &InputMatcherPtr, const CodeGenDAGPatterns &CGP) { - // If we reached the end of the chain, we're done. - Matcher *N = MatcherPtr.get(); - if (!N) - return; - - // If we have a scope node, walk down all of the children. - if (ScopeMatcher *Scope = dyn_cast(N)) { - for (unsigned i = 0, e = Scope->getNumChildren(); i != e; ++i) { - std::unique_ptr Child(Scope->takeChild(i)); - ContractNodes(Child, CGP); - Scope->resetChild(i, Child.release()); + std::unique_ptr *MatcherPtr = &InputMatcherPtr; + while (true) { + Matcher *N = MatcherPtr->get(); + + // If we have a scope node, walk down all of the children. + if (auto *Scope = dyn_cast(N)) { + for (unsigned i = 0, e = Scope->getNumChildren(); i != e; ++i) { + std::unique_ptr Child(Scope->takeChild(i)); + ContractNodes(Child, CGP); + Scope->resetChild(i, Child.release()); + } + return; } - return; - } - // If we found a movechild node with a node that comes in a 'foochild' form, - // transform it. 
- if (MoveChildMatcher *MC = dyn_cast(N)) { - Matcher *New = nullptr; - if (RecordMatcher *RM = dyn_cast(MC->getNext())) - if (MC->getChildNo() < 8) // Only have RecordChild0...7 - New = new RecordChildMatcher(MC->getChildNo(), RM->getWhatFor(), - RM->getResultNo()); - - if (CheckTypeMatcher *CT = dyn_cast(MC->getNext())) - if (MC->getChildNo() < 8 && // Only have CheckChildType0...7 - CT->getResNo() == 0) // CheckChildType checks res #0 - New = new CheckChildTypeMatcher(MC->getChildNo(), CT->getType()); - - if (CheckSameMatcher *CS = dyn_cast(MC->getNext())) - if (MC->getChildNo() < 4) // Only have CheckChildSame0...3 - New = new CheckChildSameMatcher(MC->getChildNo(), CS->getMatchNumber()); - - if (CheckIntegerMatcher *CI = dyn_cast(MC->getNext())) - if (MC->getChildNo() < 5) // Only have CheckChildInteger0...4 - New = new CheckChildIntegerMatcher(MC->getChildNo(), CI->getValue()); - - if (auto *CCC = dyn_cast(MC->getNext())) - if (MC->getChildNo() == 2) // Only have CheckChild2CondCode - New = new CheckChild2CondCodeMatcher(CCC->getCondCodeName()); - - if (New) { - // Insert the new node. - New->setNext(MatcherPtr.release()); - MatcherPtr.reset(New); - // Remove the old one. - MC->setNext(MC->getNext()->takeNext()); - return ContractNodes(MatcherPtr, CGP); + // If we found a movechild node with a node that comes in a 'foochild' form, + // transform it. 
+ if (MoveChildMatcher *MC = dyn_cast(N)) { + Matcher *New = nullptr; + if (RecordMatcher *RM = dyn_cast(MC->getNext())) + if (MC->getChildNo() < 8) // Only have RecordChild0...7 + New = new RecordChildMatcher(MC->getChildNo(), RM->getWhatFor(), + RM->getResultNo()); + + if (CheckTypeMatcher *CT = dyn_cast(MC->getNext())) + if (MC->getChildNo() < 8 && // Only have CheckChildType0...7 + CT->getResNo() == 0) // CheckChildType checks res #0 + New = new CheckChildTypeMatcher(MC->getChildNo(), CT->getType()); + + if (CheckSameMatcher *CS = dyn_cast(MC->getNext())) + if (MC->getChildNo() < 4) // Only have CheckChildSame0...3 + New = + new CheckChildSameMatcher(MC->getChildNo(), CS->getMatchNumber()); + + if (CheckIntegerMatcher *CI = + dyn_cast(MC->getNext())) + if (MC->getChildNo() < 5) // Only have CheckChildInteger0...4 + New = new CheckChildIntegerMatcher(MC->getChildNo(), CI->getValue()); + + if (auto *CCC = dyn_cast(MC->getNext())) + if (MC->getChildNo() == 2) // Only have CheckChild2CondCode + New = new CheckChild2CondCodeMatcher(CCC->getCondCodeName()); + + if (New) { + // Insert the new node. + New->setNext(MatcherPtr->release()); + MatcherPtr->reset(New); + // Remove the old one. + MC->setNext(MC->getNext()->takeNext()); + continue; + } } - } - // Turn MoveParent->MoveChild into MoveSibling. - if (auto *MP = dyn_cast(N)) { - if (auto *MC = dyn_cast(MP->getNext())) { - auto *MS = new MoveSiblingMatcher(MC->getChildNo()); - MS->setNext(MC->takeNext()); - MatcherPtr.reset(MS); - return ContractNodes(MatcherPtr, CGP); + // Turn MoveParent->MoveChild into MoveSibling. + if (auto *MP = dyn_cast(N)) { + if (auto *MC = dyn_cast(MP->getNext())) { + auto *MS = new MoveSiblingMatcher(MC->getChildNo()); + MS->setNext(MC->takeNext()); + MatcherPtr->reset(MS); + continue; + } } - } - // Uncontract MoveSibling if it will help form other child operations. 
- if (auto *MS = dyn_cast(N)) { - if (auto *RM = dyn_cast(MS->getNext())) { - // Turn MoveSibling->Record->MoveParent into MoveParent->RecordChild. - if (auto *MP = dyn_cast(RM->getNext())) { - if (MS->getSiblingNo() < 8) { // Only have RecordChild0...7 - auto *NewMP = new MoveParentMatcher(); - auto *NewRCM = new RecordChildMatcher( - MS->getSiblingNo(), RM->getWhatFor(), RM->getResultNo()); - NewMP->setNext(NewRCM); - NewRCM->setNext(MP->takeNext()); - MatcherPtr.reset(NewMP); - return ContractNodes(MatcherPtr, CGP); + // Uncontract MoveSibling if it will help form other child operations. + if (auto *MS = dyn_cast(N)) { + if (auto *RM = dyn_cast(MS->getNext())) { + // Turn MoveSibling->Record->MoveParent into MoveParent->RecordChild. + if (auto *MP = dyn_cast(RM->getNext())) { + if (MS->getSiblingNo() < 8) { // Only have RecordChild0...7 + auto *NewMP = new MoveParentMatcher(); + auto *NewRCM = new RecordChildMatcher( + MS->getSiblingNo(), RM->getWhatFor(), RM->getResultNo()); + NewMP->setNext(NewRCM); + NewRCM->setNext(MP->takeNext()); + MatcherPtr->reset(NewMP); + continue; + } + } + + // Turn MoveSibling->Record->CheckType->MoveParent into + // MoveParent->RecordChild->CheckChildType. + if (auto *CT = dyn_cast(RM->getNext())) { + if (auto *MP = dyn_cast(CT->getNext())) { + if (MS->getSiblingNo() < 8 && // Only have CheckChildType0...7 + CT->getResNo() == 0) { // CheckChildType checks res #0 + auto *NewMP = new MoveParentMatcher(); + auto *NewRCM = new RecordChildMatcher( + MS->getSiblingNo(), RM->getWhatFor(), RM->getResultNo()); + auto *NewCCT = + new CheckChildTypeMatcher(MS->getSiblingNo(), CT->getType()); + NewMP->setNext(NewRCM); + NewRCM->setNext(NewCCT); + NewCCT->setNext(MP->takeNext()); + MatcherPtr->reset(NewMP); + continue; + } + } } } - // Turn MoveSibling->Record->CheckType->MoveParent into - // MoveParent->RecordChild->CheckChildType. 
- if (auto *CT = dyn_cast(RM->getNext())) { + // Turn MoveSibling->CheckType->MoveParent into + // MoveParent->CheckChildType. + if (auto *CT = dyn_cast(MS->getNext())) { if (auto *MP = dyn_cast(CT->getNext())) { if (MS->getSiblingNo() < 8 && // Only have CheckChildType0...7 CT->getResNo() == 0) { // CheckChildType checks res #0 auto *NewMP = new MoveParentMatcher(); - auto *NewRCM = new RecordChildMatcher( - MS->getSiblingNo(), RM->getWhatFor(), RM->getResultNo()); auto *NewCCT = new CheckChildTypeMatcher(MS->getSiblingNo(), CT->getType()); - NewMP->setNext(NewRCM); - NewRCM->setNext(NewCCT); + NewMP->setNext(NewCCT); NewCCT->setNext(MP->takeNext()); - MatcherPtr.reset(NewMP); - return ContractNodes(MatcherPtr, CGP); + MatcherPtr->reset(NewMP); + continue; } } } - } - - // Turn MoveSibling->CheckType->MoveParent into MoveParent->CheckChildType. - if (auto *CT = dyn_cast(MS->getNext())) { - if (auto *MP = dyn_cast(CT->getNext())) { - if (MS->getSiblingNo() < 8 && // Only have CheckChildType0...7 - CT->getResNo() == 0) { // CheckChildType checks res #0 - auto *NewMP = new MoveParentMatcher(); - auto *NewCCT = - new CheckChildTypeMatcher(MS->getSiblingNo(), CT->getType()); - NewMP->setNext(NewCCT); - NewCCT->setNext(MP->takeNext()); - MatcherPtr.reset(NewMP); - return ContractNodes(MatcherPtr, CGP); - } - } - } - - // Turn MoveSibling->CheckInteger->MoveParent into - // MoveParent->CheckChildInteger. - if (auto *CI = dyn_cast(MS->getNext())) { - if (auto *MP = dyn_cast(CI->getNext())) { - if (MS->getSiblingNo() < 5) { // Only have CheckChildInteger0...4 - auto *NewMP = new MoveParentMatcher(); - auto *NewCCI = - new CheckChildIntegerMatcher(MS->getSiblingNo(), CI->getValue()); - NewMP->setNext(NewCCI); - NewCCI->setNext(MP->takeNext()); - MatcherPtr.reset(NewMP); - return ContractNodes(MatcherPtr, CGP); - } - } - // Turn MoveSibling->CheckInteger->CheckType->MoveParent into - // MoveParent->CheckChildInteger->CheckType. 
- if (auto *CT = dyn_cast(CI->getNext())) { - if (auto *MP = dyn_cast(CT->getNext())) { - if (MS->getSiblingNo() < 5 && // Only have CheckChildInteger0...4 - CT->getResNo() == 0) { // CheckChildType checks res #0 + // Turn MoveSibling->CheckInteger->MoveParent into + // MoveParent->CheckChildInteger. + if (auto *CI = dyn_cast(MS->getNext())) { + if (auto *MP = dyn_cast(CI->getNext())) { + if (MS->getSiblingNo() < 5) { // Only have CheckChildInteger0...4 auto *NewMP = new MoveParentMatcher(); auto *NewCCI = new CheckChildIntegerMatcher(MS->getSiblingNo(), CI->getValue()); - auto *NewCCT = - new CheckChildTypeMatcher(MS->getSiblingNo(), CT->getType()); NewMP->setNext(NewCCI); - NewCCI->setNext(NewCCT); - NewCCT->setNext(MP->takeNext()); - MatcherPtr.reset(NewMP); - return ContractNodes(MatcherPtr, CGP); + NewCCI->setNext(MP->takeNext()); + MatcherPtr->reset(NewMP); + continue; } } - } - } - // Turn MoveSibling->CheckCondCode->MoveParent into - // MoveParent->CheckChild2CondCode. - if (auto *CCC = dyn_cast(MS->getNext())) { - if (auto *MP = dyn_cast(CCC->getNext())) { - if (MS->getSiblingNo() == 2) { // Only have CheckChild2CondCode - auto *NewMP = new MoveParentMatcher(); - auto *NewCCCC = - new CheckChild2CondCodeMatcher(CCC->getCondCodeName()); - NewMP->setNext(NewCCCC); - NewCCCC->setNext(MP->takeNext()); - MatcherPtr.reset(NewMP); - return ContractNodes(MatcherPtr, CGP); + // Turn MoveSibling->CheckInteger->CheckType->MoveParent into + // MoveParent->CheckChildInteger->CheckType. 
+ if (auto *CT = dyn_cast(CI->getNext())) { + if (auto *MP = dyn_cast(CT->getNext())) { + if (MS->getSiblingNo() < 5 && // Only have CheckChildInteger0...4 + CT->getResNo() == 0) { // CheckChildType checks res #0 + auto *NewMP = new MoveParentMatcher(); + auto *NewCCI = new CheckChildIntegerMatcher(MS->getSiblingNo(), + CI->getValue()); + auto *NewCCT = + new CheckChildTypeMatcher(MS->getSiblingNo(), CT->getType()); + NewMP->setNext(NewCCI); + NewCCI->setNext(NewCCT); + NewCCT->setNext(MP->takeNext()); + MatcherPtr->reset(NewMP); + continue; + } + } } } - } - // Turn MoveSibling->CheckSame->MoveParent into - // MoveParent->CheckChildSame. - if (auto *CS = dyn_cast(MS->getNext())) { - if (auto *MP = dyn_cast(CS->getNext())) { - if (MS->getSiblingNo() < 4) { // Only have CheckChildSame0...3 - auto *NewMP = new MoveParentMatcher(); - auto *NewCCS = new CheckChildSameMatcher(MS->getSiblingNo(), - CS->getMatchNumber()); - NewMP->setNext(NewCCS); - NewCCS->setNext(MP->takeNext()); - MatcherPtr.reset(NewMP); - return ContractNodes(MatcherPtr, CGP); + // Turn MoveSibling->CheckCondCode->MoveParent into + // MoveParent->CheckChild2CondCode. + if (auto *CCC = dyn_cast(MS->getNext())) { + if (auto *MP = dyn_cast(CCC->getNext())) { + if (MS->getSiblingNo() == 2) { // Only have CheckChild2CondCode + auto *NewMP = new MoveParentMatcher(); + auto *NewCCCC = + new CheckChild2CondCodeMatcher(CCC->getCondCodeName()); + NewMP->setNext(NewCCCC); + NewCCCC->setNext(MP->takeNext()); + MatcherPtr->reset(NewMP); + continue; + } } } - // Turn MoveSibling->CheckSame->CheckType->MoveParent into - // MoveParent->CheckChildSame->CheckChildType. - if (auto *CT = dyn_cast(CS->getNext())) { - if (auto *MP = dyn_cast(CT->getNext())) { - if (MS->getSiblingNo() < 4 && // Only have CheckChildSame0...3 - CT->getResNo() == 0) { // CheckChildType checks res #0 + // Turn MoveSibling->CheckSame->MoveParent into + // MoveParent->CheckChildSame. 
+ if (auto *CS = dyn_cast(MS->getNext())) { + if (auto *MP = dyn_cast(CS->getNext())) { + if (MS->getSiblingNo() < 4) { // Only have CheckChildSame0...3 auto *NewMP = new MoveParentMatcher(); auto *NewCCS = new CheckChildSameMatcher(MS->getSiblingNo(), CS->getMatchNumber()); - auto *NewCCT = - new CheckChildTypeMatcher(MS->getSiblingNo(), CT->getType()); NewMP->setNext(NewCCS); - NewCCS->setNext(NewCCT); - NewCCT->setNext(MP->takeNext()); - MatcherPtr.reset(NewMP); - return ContractNodes(MatcherPtr, CGP); + NewCCS->setNext(MP->takeNext()); + MatcherPtr->reset(NewMP); + continue; + } + } + + // Turn MoveSibling->CheckSame->CheckType->MoveParent into + // MoveParent->CheckChildSame->CheckChildType. + if (auto *CT = dyn_cast(CS->getNext())) { + if (auto *MP = dyn_cast(CT->getNext())) { + if (MS->getSiblingNo() < 4 && // Only have CheckChildSame0...3 + CT->getResNo() == 0) { // CheckChildType checks res #0 + auto *NewMP = new MoveParentMatcher(); + auto *NewCCS = new CheckChildSameMatcher(MS->getSiblingNo(), + CS->getMatchNumber()); + auto *NewCCT = + new CheckChildTypeMatcher(MS->getSiblingNo(), CT->getType()); + NewMP->setNext(NewCCS); + NewCCS->setNext(NewCCT); + NewCCT->setNext(MP->takeNext()); + MatcherPtr->reset(NewMP); + continue; + } } } } - } - // Turn MoveSibling->MoveParent into MoveParent. - if (isa(MS->getNext())) { - MatcherPtr.reset(MS->takeNext()); - return ContractNodes(MatcherPtr, CGP); + // Turn MoveSibling->MoveParent into MoveParent. + if (isa(MS->getNext())) { + MatcherPtr->reset(MS->takeNext()); + continue; + } } - } - // Zap movechild -> moveparent. - if (MoveChildMatcher *MC = dyn_cast(N)) - if (MoveParentMatcher *MP = dyn_cast(MC->getNext())) { - MatcherPtr.reset(MP->takeNext()); - return ContractNodes(MatcherPtr, CGP); - } + // Zap movechild -> moveparent. 
+ if (MoveChildMatcher *MC = dyn_cast(N)) + if (MoveParentMatcher *MP = dyn_cast(MC->getNext())) { + MatcherPtr->reset(MP->takeNext()); + continue; + } - // Turn EmitNode->CompleteMatch into MorphNodeTo if we can. - if (EmitNodeMatcher *EN = dyn_cast(N)) - if (CompleteMatchMatcher *CM = - dyn_cast(EN->getNext())) { - // We can only use MorphNodeTo if the result values match up. - unsigned RootResultFirst = EN->getFirstResultSlot(); - bool ResultsMatch = true; - for (unsigned i = 0, e = CM->getNumResults(); i != e; ++i) - if (CM->getResult(i) != RootResultFirst + i) + // Turn EmitNode->CompleteMatch into MorphNodeTo if we can. + if (EmitNodeMatcher *EN = dyn_cast(N)) { + if (CompleteMatchMatcher *CM = + dyn_cast(EN->getNext())) { + // We can only use MorphNodeTo if the result values match up. + unsigned RootResultFirst = EN->getFirstResultSlot(); + bool ResultsMatch = true; + for (unsigned i = 0, e = CM->getNumResults(); i != e; ++i) + if (CM->getResult(i) != RootResultFirst + i) + ResultsMatch = false; + + // If the selected node defines a subset of the glue/chain results, we + // can't use MorphNodeTo. For example, we can't use MorphNodeTo if the + // matched pattern has a chain but the root node doesn't. + const PatternToMatch &Pattern = CM->getPattern(); + + if (!EN->hasChain() && + Pattern.getSrcPattern().NodeHasProperty(SDNPHasChain, CGP)) ResultsMatch = false; - // If the selected node defines a subset of the glue/chain results, we - // can't use MorphNodeTo. For example, we can't use MorphNodeTo if the - // matched pattern has a chain but the root node doesn't. - const PatternToMatch &Pattern = CM->getPattern(); - - if (!EN->hasChain() && - Pattern.getSrcPattern().NodeHasProperty(SDNPHasChain, CGP)) - ResultsMatch = false; - - // If the matched node has glue and the output root doesn't, we can't - // use MorphNodeTo. - // - // NOTE: Strictly speaking, we don't have to check for glue here - // because the code in the pattern generator doesn't handle it right. 
We - // do it anyway for thoroughness. - if (!EN->hasOutGlue() && - Pattern.getSrcPattern().NodeHasProperty(SDNPOutGlue, CGP)) - ResultsMatch = false; + // If the matched node has glue and the output root doesn't, we can't + // use MorphNodeTo. + // + // NOTE: Strictly speaking, we don't have to check for glue here + // because the code in the pattern generator doesn't handle it right. We + // do it anyway for thoroughness. + if (!EN->hasOutGlue() && + Pattern.getSrcPattern().NodeHasProperty(SDNPOutGlue, CGP)) + ResultsMatch = false; #if 0 - // If the root result node defines more results than the source root node - // *and* has a chain or glue input, then we can't match it because it - // would end up replacing the extra result with the chain/glue. - if ((EN->hasGlue() || EN->hasChain()) && - EN->getNumNonChainGlueVTs() > ... need to get no results reliably ...) - ResultMatch = false; + // If the root result node defines more results than the source root + // node *and* has a chain or glue input, then we can't match it because + // it would end up replacing the extra result with the chain/glue. + if ((EN->hasGlue() || EN->hasChain()) && + EN->getNumNonChainGlueVTs() > ...need to get no results reliably...) 
+ ResultMatch = false; #endif - if (ResultsMatch) { - const SmallVectorImpl &VTs = EN->getVTList(); - const SmallVectorImpl &Operands = EN->getOperandList(); - MatcherPtr.reset(new MorphNodeToMatcher( - EN->getInstruction(), VTs, Operands, EN->hasChain(), - EN->hasInGlue(), EN->hasOutGlue(), EN->hasMemRefs(), - EN->getNumFixedArityOperands(), Pattern)); - return; + if (ResultsMatch) { + const SmallVectorImpl &VTs = EN->getVTList(); + const SmallVectorImpl &Operands = EN->getOperandList(); + MatcherPtr->reset(new MorphNodeToMatcher( + EN->getInstruction(), VTs, Operands, EN->hasChain(), + EN->hasInGlue(), EN->hasOutGlue(), EN->hasMemRefs(), + EN->getNumFixedArityOperands(), Pattern)); + return; + } } - - // FIXME2: Kill off all the SelectionDAG::SelectNodeTo and getMachineNode - // variants. } // If we have a Record node followed by a CheckOpcode, invert the two nodes. @@ -299,18 +299,24 @@ static void ContractNodes(std::unique_ptr &MatcherPtr, // valid on multiple types. if (isa(N) && isa(N->getNext())) { // Unlink the two nodes from the list. - Matcher *CheckType = MatcherPtr.release(); + Matcher *CheckType = MatcherPtr->release(); Matcher *CheckOpcode = CheckType->takeNext(); Matcher *Tail = CheckOpcode->takeNext(); // Relink them. - MatcherPtr.reset(CheckOpcode); + MatcherPtr->reset(CheckOpcode); CheckOpcode->setNext(CheckType); CheckType->setNext(Tail); - return ContractNodes(MatcherPtr, CGP); + continue; } - ContractNodes(N->getNextPtr(), CGP); + // No contractions were performed, go to next node. + MatcherPtr = &(MatcherPtr->get()->getNextPtr()); + + // If we reached the end of the chain, we're done. 
+ if (!MatcherPtr->get()) + return; + } } /// FindNodeWithKind - Scan a series of matchers looking for a matcher with a From 560cea61abc68a9278d0ada26b3e7071e7b97bfe Mon Sep 17 00:00:00 2001 From: Jonathan Thackray Date: Mon, 10 Feb 2025 00:03:05 +0000 Subject: [PATCH 113/293] [NFC][AArch64] move AArch64 non auto-generated tests to static file (#126312) Move AArch64 non auto-generated test code into a static file, since the script `./llvm/test/CodeGen/AArch64/Atomics/generate-tests.py` will overwrite these tests when re-run. (Test code was originally added in change 465bc5e729fd755880b9a288de42a37ad1206301) --- .../Atomics/aarch64-atomic-load-lse2.ll | 112 ----------------- llvm/test/CodeGen/AArch64/v8.4-atomic.ll | 113 ++++++++++++++++++ 2 files changed, 113 insertions(+), 112 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/v8.4-atomic.ll diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll index 3732d4feb0c67..0e9c29758244a 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-lse2.ll @@ -567,118 +567,6 @@ define dso_local i128 @load_atomic_i128_unaligned_seq_cst_const(ptr readonly %pt ret i128 %r } -define dso_local half @load_atomic_f16_aligned_unordered(ptr %ptr) { -; CHECK-LABEL: load_atomic_f16_aligned_unordered: -; CHECK: ldrh w8, [x0] - %r = load atomic half, ptr %ptr unordered, align 2 - ret half %r -} - -define dso_local half @load_atomic_f16_aligned_unordered_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_f16_aligned_unordered_const: -; CHECK: ldrh w8, [x0] - %r = load atomic half, ptr %ptr unordered, align 2 - ret half %r -} - -define dso_local half @load_atomic_f16_aligned_monotonic(ptr %ptr) { -; CHECK-LABEL: load_atomic_f16_aligned_monotonic: -; CHECK: ldrh w8, [x0] - %r = load atomic half, ptr %ptr monotonic, align 2 - ret half %r -} - -define dso_local half 
@load_atomic_f16_aligned_monotonic_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_f16_aligned_monotonic_const: -; CHECK: ldrh w8, [x0] - %r = load atomic half, ptr %ptr monotonic, align 2 - ret half %r -} - -define dso_local half @load_atomic_f16_aligned_acquire(ptr %ptr) { -; CHECK-LABEL: load_atomic_f16_aligned_acquire: -; CHECK: ldarh w8, [x0] - %r = load atomic half, ptr %ptr acquire, align 2 - ret half %r -} - -define dso_local half @load_atomic_f16_aligned_acquire_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_f16_aligned_acquire_const: -; CHECK: ldarh w8, [x0] - %r = load atomic half, ptr %ptr acquire, align 2 - ret half %r -} - -define dso_local half @load_atomic_f16_aligned_seq_cst(ptr %ptr) { -; CHECK-LABEL: load_atomic_f16_aligned_seq_cst: -; CHECK: ldarh w8, [x0] - %r = load atomic half, ptr %ptr seq_cst, align 2 - ret half %r -} - -define dso_local half @load_atomic_f16_aligned_seq_cst_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_f16_aligned_seq_cst_const: -; CHECK: ldarh w8, [x0] - %r = load atomic half, ptr %ptr seq_cst, align 2 - ret half %r -} - -define dso_local bfloat @load_atomic_bf16_aligned_unordered(ptr %ptr) { -; CHECK-LABEL: load_atomic_bf16_aligned_unordered: -; CHECK: ldrh w8, [x0] - %r = load atomic bfloat, ptr %ptr unordered, align 2 - ret bfloat %r -} - -define dso_local bfloat @load_atomic_bf16_aligned_unordered_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_bf16_aligned_unordered_const: -; CHECK: ldrh w8, [x0] - %r = load atomic bfloat, ptr %ptr unordered, align 2 - ret bfloat %r -} - -define dso_local bfloat @load_atomic_bf16_aligned_monotonic(ptr %ptr) { -; CHECK-LABEL: load_atomic_bf16_aligned_monotonic: -; CHECK: ldrh w8, [x0] - %r = load atomic bfloat, ptr %ptr monotonic, align 2 - ret bfloat %r -} - -define dso_local bfloat @load_atomic_bf16_aligned_monotonic_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_bf16_aligned_monotonic_const: -; CHECK: ldrh w8, [x0] - %r = load atomic bfloat, 
ptr %ptr monotonic, align 2 - ret bfloat %r -} - -define dso_local bfloat @load_atomic_bf16_aligned_acquire(ptr %ptr) { -; CHECK-LABEL: load_atomic_bf16_aligned_acquire: -; CHECK: ldarh w8, [x0] - %r = load atomic bfloat, ptr %ptr acquire, align 2 - ret bfloat %r -} - -define dso_local bfloat @load_atomic_bf16_aligned_acquire_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_bf16_aligned_acquire_const: -; CHECK: ldarh w8, [x0] - %r = load atomic bfloat, ptr %ptr acquire, align 2 - ret bfloat %r -} - -define dso_local bfloat @load_atomic_bf16_aligned_seq_cst(ptr %ptr) { -; CHECK-LABEL: load_atomic_bf16_aligned_seq_cst: -; CHECK: ldarh w8, [x0] - %r = load atomic bfloat, ptr %ptr seq_cst, align 2 - ret bfloat %r -} - -define dso_local bfloat @load_atomic_bf16_aligned_seq_cst_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_bf16_aligned_seq_cst_const: -; CHECK: ldarh w8, [x0] - %r = load atomic bfloat, ptr %ptr seq_cst, align 2 - ret bfloat %r -} - ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; -O0: {{.*}} ; -O1: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/v8.4-atomic.ll b/llvm/test/CodeGen/AArch64/v8.4-atomic.ll new file mode 100644 index 0000000000000..1394b89159c41 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/v8.4-atomic.ll @@ -0,0 +1,113 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse2 %s -o - | FileCheck %s + +define dso_local half @load_atomic_f16_aligned_unordered(ptr %ptr) { +; CHECK-LABEL: load_atomic_f16_aligned_unordered: +; CHECK: ldrh w8, [x0] + %r = load atomic half, ptr %ptr unordered, align 2 + ret half %r +} + +define dso_local half @load_atomic_f16_aligned_unordered_const(ptr readonly %ptr) { +; CHECK-LABEL: load_atomic_f16_aligned_unordered_const: +; CHECK: ldrh w8, [x0] + %r = load atomic half, ptr %ptr unordered, align 2 + ret half %r +} + +define dso_local half @load_atomic_f16_aligned_monotonic(ptr %ptr) { +; CHECK-LABEL: load_atomic_f16_aligned_monotonic: +; CHECK: ldrh w8, [x0] + %r = load atomic half, ptr %ptr monotonic, align 2 + ret half %r +} + +define dso_local half @load_atomic_f16_aligned_monotonic_const(ptr readonly %ptr) { +; CHECK-LABEL: load_atomic_f16_aligned_monotonic_const: +; CHECK: ldrh w8, [x0] + %r = load atomic half, ptr %ptr monotonic, align 2 + ret half %r +} + +define dso_local half @load_atomic_f16_aligned_acquire(ptr %ptr) { +; CHECK-LABEL: load_atomic_f16_aligned_acquire: +; CHECK: ldarh w8, [x0] + %r = load atomic half, ptr %ptr acquire, align 2 + ret half %r +} + +define dso_local half @load_atomic_f16_aligned_acquire_const(ptr readonly %ptr) { +; CHECK-LABEL: load_atomic_f16_aligned_acquire_const: +; CHECK: ldarh w8, [x0] + %r = load atomic half, ptr %ptr acquire, align 2 + ret half %r +} + +define dso_local half @load_atomic_f16_aligned_seq_cst(ptr %ptr) { +; CHECK-LABEL: load_atomic_f16_aligned_seq_cst: +; CHECK: ldarh w8, [x0] + %r = load atomic half, ptr %ptr seq_cst, align 2 + ret half %r +} + +define dso_local half 
@load_atomic_f16_aligned_seq_cst_const(ptr readonly %ptr) { +; CHECK-LABEL: load_atomic_f16_aligned_seq_cst_const: +; CHECK: ldarh w8, [x0] + %r = load atomic half, ptr %ptr seq_cst, align 2 + ret half %r +} + +define dso_local bfloat @load_atomic_bf16_aligned_unordered(ptr %ptr) { +; CHECK-LABEL: load_atomic_bf16_aligned_unordered: +; CHECK: ldrh w8, [x0] + %r = load atomic bfloat, ptr %ptr unordered, align 2 + ret bfloat %r +} + +define dso_local bfloat @load_atomic_bf16_aligned_unordered_const(ptr readonly %ptr) { +; CHECK-LABEL: load_atomic_bf16_aligned_unordered_const: +; CHECK: ldrh w8, [x0] + %r = load atomic bfloat, ptr %ptr unordered, align 2 + ret bfloat %r +} + +define dso_local bfloat @load_atomic_bf16_aligned_monotonic(ptr %ptr) { +; CHECK-LABEL: load_atomic_bf16_aligned_monotonic: +; CHECK: ldrh w8, [x0] + %r = load atomic bfloat, ptr %ptr monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @load_atomic_bf16_aligned_monotonic_const(ptr readonly %ptr) { +; CHECK-LABEL: load_atomic_bf16_aligned_monotonic_const: +; CHECK: ldrh w8, [x0] + %r = load atomic bfloat, ptr %ptr monotonic, align 2 + ret bfloat %r +} + +define dso_local bfloat @load_atomic_bf16_aligned_acquire(ptr %ptr) { +; CHECK-LABEL: load_atomic_bf16_aligned_acquire: +; CHECK: ldarh w8, [x0] + %r = load atomic bfloat, ptr %ptr acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @load_atomic_bf16_aligned_acquire_const(ptr readonly %ptr) { +; CHECK-LABEL: load_atomic_bf16_aligned_acquire_const: +; CHECK: ldarh w8, [x0] + %r = load atomic bfloat, ptr %ptr acquire, align 2 + ret bfloat %r +} + +define dso_local bfloat @load_atomic_bf16_aligned_seq_cst(ptr %ptr) { +; CHECK-LABEL: load_atomic_bf16_aligned_seq_cst: +; CHECK: ldarh w8, [x0] + %r = load atomic bfloat, ptr %ptr seq_cst, align 2 + ret bfloat %r +} + +define dso_local bfloat @load_atomic_bf16_aligned_seq_cst_const(ptr readonly %ptr) { +; CHECK-LABEL: load_atomic_bf16_aligned_seq_cst_const: +; CHECK: ldarh w8, [x0] 
+ %r = load atomic bfloat, ptr %ptr seq_cst, align 2 + ret bfloat %r +} From 2e3729bf40040ac960153e893d670c58f94eac62 Mon Sep 17 00:00:00 2001 From: Elvis Wang Date: Mon, 10 Feb 2025 08:40:28 +0800 Subject: [PATCH 114/293] [LV] Prevent query the computeCost() when VF=1 in emitInvalidCostRemarks(). (#117288) We should only query the computeCost() when the VF is vector. --- .../Transforms/Vectorize/LoopVectorize.cpp | 7 ++ .../LoopVectorize/RISCV/remark-reductions.ll | 67 +++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index c2d347cf9b7e0..610e4904a80ad 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4360,6 +4360,13 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( SmallVector InvalidCosts; for (const auto &Plan : VPlans) { for (ElementCount VF : Plan->vectorFactors()) { + // The VPlan-based cost model is designed for computing vector cost. + // Querying VPlan-based cost model with a scalar VF will cause some + // errors because we expect the VF is vector for most of the widen + // recipes. 
+ if (VF.isScalar()) + continue; + VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, CM.CostKind); precomputeCosts(*Plan, VF, CostCtx); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll new file mode 100644 index 0000000000000..e3e727b41c02d --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll @@ -0,0 +1,67 @@ +; RUN: opt < %s -mtriple=riscv64 -mattr=+v -p loop-vectorize -pass-remarks-analysis=loop-vectorize -S 2>&1 | FileCheck %s + +; CHECK: remark: :0:0: the cost-model indicates that interleaving is not beneficial +define float @s311(float %a_0, float %s311_sum) { +; CHECK-LABEL: define float @s311( +; CHECK-SAME: float [[A_0:%.*]], float [[S311_SUM:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 1200, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 1200, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 1200, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 4 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, float [[A_0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ [[S311_SUM]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6]] = call 
float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1200, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ [[S311_SUM]], %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED_NEXT]] = fadd float [[A_0]], [[RED]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1200 +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[RED_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %red = phi float [ %s311_sum, %entry ], [ %red.next, %loop ] + %red.next = fadd float %a_0, %red + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, 1200 + br i1 %exitcond, label %exit, label %loop + +exit: + %red.lcssa = phi float [ %red.next, %loop ] + ret float %red.lcssa +} +;. 
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. From 967973512b9eba99dd8b04db42dbafcc50d94728 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Sun, 9 Feb 2025 21:48:24 -0500 Subject: [PATCH 115/293] [AMDGPU] Don't unify divergent exit nodes with `musttail` calls (#126395) Fixes SWDEV-512254. --- .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 5 +- ...nify-divergent-exit-nodes-with-musttail.ll | 53 +++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index fda2a38c2464e..d087fbc86545c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -215,7 +215,10 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, PDT.roots(), [&](auto BB) { return !isUniformlyReached(UA, *BB); }); for (BasicBlock *BB : PDT.roots()) { - if (isa(BB->getTerminator())) { + if (auto *RI = dyn_cast(BB->getTerminator())) { + auto *CI = dyn_cast_or_null(RI->getPrevNode()); + if (CI && CI->isMustTailCall()) + continue; if (HasDivergentExitBlock) ReturningBlocks.push_back(BB); } else if (isa(BB->getTerminator())) { diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll new file mode 100644 index 0000000000000..007e3f0a6bdbc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; 
RUN: opt -S -passes=amdgpu-unify-divergent-exit-nodes -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s -o - | FileCheck %s + +declare void @foo(ptr) +declare i1 @bar(ptr) + +define void @musttail_call_without_return_value(ptr %p) { +; CHECK-LABEL: define void @musttail_call_without_return_value( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i1, ptr [[P]], align 1 +; CHECK-NEXT: br i1 [[LOAD]], label %[[BB_0:.*]], label %[[BB_1:.*]] +; CHECK: [[BB_0]]: +; CHECK-NEXT: musttail call void @foo(ptr [[P]]) +; CHECK-NEXT: ret void +; CHECK: [[BB_1]]: +; CHECK-NEXT: ret void +; +entry: + %load = load i1, ptr %p, align 1 + br i1 %load, label %bb.0, label %bb.1 + +bb.0: + musttail call void @foo(ptr %p) + ret void + +bb.1: + ret void +} + +define i1 @musttail_call_with_return_value(ptr %p) { +; CHECK-LABEL: define i1 @musttail_call_with_return_value( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i1, ptr [[P]], align 1 +; CHECK-NEXT: br i1 [[LOAD]], label %[[BB_0:.*]], label %[[BB_1:.*]] +; CHECK: [[BB_0]]: +; CHECK-NEXT: [[RET:%.*]] = musttail call i1 @bar(ptr [[P]]) +; CHECK-NEXT: ret i1 [[RET]] +; CHECK: [[BB_1]]: +; CHECK-NEXT: ret i1 [[LOAD]] +; +entry: + %load = load i1, ptr %p, align 1 + br i1 %load, label %bb.0, label %bb.1 + +bb.0: + %ret = musttail call i1 @bar(ptr %p) + ret i1 %ret + +bb.1: + ret i1 %load +} From 161cfc6f39bef8994eb944687033ebd3570196e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Pir=C3=B3g?= Date: Mon, 10 Feb 2025 05:48:02 +0100 Subject: [PATCH 116/293] [AVX10.2] Fix wrong intrinsic names after rename (#126390) In my previous PR (#123656) to update the names of AVX10.2 intrinsics and mnemonics, I have erroneously deleted `_ph` from few intrinsics. This PR corrects this. 
--- clang/lib/Headers/avx10_2_512convertintrin.h | 6 ++-- clang/lib/Headers/avx10_2convertintrin.h | 17 +++++---- .../CodeGen/X86/avx10_2_512convert-builtins.c | 18 +++++----- .../CodeGen/X86/avx10_2convert-builtins.c | 36 +++++++++---------- 4 files changed, 38 insertions(+), 39 deletions(-) diff --git a/clang/lib/Headers/avx10_2_512convertintrin.h b/clang/lib/Headers/avx10_2_512convertintrin.h index 0b5fca5cda522..516ccc68672d6 100644 --- a/clang/lib/Headers/avx10_2_512convertintrin.h +++ b/clang/lib/Headers/avx10_2_512convertintrin.h @@ -213,19 +213,19 @@ _mm512_maskz_cvts2ph_hf8(__mmask64 __U, __m512h __A, __m512h __B) { (__v64qi)(__m512i)_mm512_setzero_si512()); } -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_cvthf8(__m256i __A) { +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_cvthf8_ph(__m256i __A) { return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask( (__v32qi)__A, (__v32hf)(__m512h)_mm512_undefined_ph(), (__mmask32)-1); } static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_cvthf8(__m512h __W, __mmask32 __U, __m256i __A) { +_mm512_mask_cvthf8_ph(__m512h __W, __mmask32 __U, __m256i __A) { return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask( (__v32qi)__A, (__v32hf)(__m512h)__W, (__mmask32)__U); } static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvthf8(__mmask32 __U, __m256i __A) { +_mm512_maskz_cvthf8_ph(__mmask32 __U, __m256i __A) { return (__m512h)__builtin_ia32_vcvthf8_2ph512_mask( (__v32qi)__A, (__v32hf)(__m512h)_mm512_setzero_ph(), (__mmask32)__U); } diff --git a/clang/lib/Headers/avx10_2convertintrin.h b/clang/lib/Headers/avx10_2convertintrin.h index c67a5b890f195..c419323910f18 100644 --- a/clang/lib/Headers/avx10_2convertintrin.h +++ b/clang/lib/Headers/avx10_2convertintrin.h @@ -381,37 +381,36 @@ _mm256_maskz_cvts2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) { (__v32qi)(__m256i)_mm256_setzero_si256()); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvthf8(__m128i __A) { +static __inline__ __m128h 
__DEFAULT_FN_ATTRS128 _mm_cvthf8_ph(__m128i __A) { return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask( (__v16qi)__A, (__v8hf)(__m128h)_mm_undefined_ph(), (__mmask8)-1); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvthf8(__m128h __W, - __mmask8 __U, - __m128i __A) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvthf8_ph(__m128h __W, __mmask8 __U, __m128i __A) { return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask( (__v16qi)__A, (__v8hf)(__m128h)__W, (__mmask8)__U); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvthf8(__mmask8 __U, - __m128i __A) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvthf8_ph(__mmask8 __U, __m128i __A) { return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask( (__v16qi)__A, (__v8hf)(__m128h)_mm_setzero_ph(), (__mmask8)__U); } -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvthf8(__m128i __A) { +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvthf8_ph(__m128i __A) { return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask( (__v16qi)__A, (__v16hf)(__m256h)_mm256_undefined_ph(), (__mmask16)-1); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_cvthf8(__m256h __W, __mmask16 __U, __m128i __A) { +_mm256_mask_cvthf8_ph(__m256h __W, __mmask16 __U, __m128i __A) { return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask( (__v16qi)__A, (__v16hf)(__m256h)__W, (__mmask16)__U); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvthf8(__mmask16 __U, __m128i __A) { +_mm256_maskz_cvthf8_ph(__mmask16 __U, __m128i __A) { return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask( (__v16qi)__A, (__v16hf)(__m256h)_mm256_setzero_ph(), (__mmask16)__U); } diff --git a/clang/test/CodeGen/X86/avx10_2_512convert-builtins.c b/clang/test/CodeGen/X86/avx10_2_512convert-builtins.c index 22503c640a727..dcf7bbc005a7c 100644 --- a/clang/test/CodeGen/X86/avx10_2_512convert-builtins.c +++ b/clang/test/CodeGen/X86/avx10_2_512convert-builtins.c @@ -201,22 +201,22 @@ __m512i 
test_mm512_maskz_cvts2ph_hf8(__mmask64 __U, __m512h __A, __m512h __B) { return _mm512_maskz_cvts2ph_hf8(__U, __A, __B); } -__m512h test_mm512_cvthf8(__m256i __A) { - // CHECK-LABEL: @test_mm512_cvthf8( +__m512h test_mm512_cvthf8_ph(__m256i __A) { + // CHECK-LABEL: @test_mm512_cvthf8_ph( // CHECK: call <32 x half> @llvm.x86.avx10.mask.vcvthf82ph512( - return _mm512_cvthf8(__A); + return _mm512_cvthf8_ph(__A); } -__m512h test_mm512_mask_cvthf8(__m512h __A, __mmask32 __B, __m256i __C) { - // CHECK-LABEL: @test_mm512_mask_cvthf8( +__m512h test_mm512_mask_cvthf8_ph(__m512h __A, __mmask32 __B, __m256i __C) { + // CHECK-LABEL: @test_mm512_mask_cvthf8_ph( // CHECK: call <32 x half> @llvm.x86.avx10.mask.vcvthf82ph512( - return _mm512_mask_cvthf8(__A, __B, __C); + return _mm512_mask_cvthf8_ph(__A, __B, __C); } -__m512h test_mm512_maskz_cvthf8(__mmask32 __A, __m256i __B) { - // CHECK-LABEL: @test_mm512_maskz_cvthf8( +__m512h test_mm512_maskz_cvthf8_ph(__mmask32 __A, __m256i __B) { + // CHECK-LABEL: @test_mm512_maskz_cvthf8_ph( // CHECK: call <32 x half> @llvm.x86.avx10.mask.vcvthf82ph512( - return _mm512_maskz_cvthf8(__A, __B); + return _mm512_maskz_cvthf8_ph(__A, __B); } __m256i test_mm512_cvtph_bf8(__m512h __A) { diff --git a/clang/test/CodeGen/X86/avx10_2convert-builtins.c b/clang/test/CodeGen/X86/avx10_2convert-builtins.c index efd9a31c40875..d73a223922ce0 100644 --- a/clang/test/CodeGen/X86/avx10_2convert-builtins.c +++ b/clang/test/CodeGen/X86/avx10_2convert-builtins.c @@ -379,40 +379,40 @@ __m256i test_mm256_maskz_cvts2ph_hf8(__mmask16 __U, __m256h __A, __m256h __B) { return _mm256_maskz_cvts2ph_hf8(__U, __A, __B); } -__m128h test_mm_cvthf8(__m128i __A) { - // CHECK-LABEL: @test_mm_cvthf8( +__m128h test_mm_cvthf8_ph(__m128i __A) { + // CHECK-LABEL: @test_mm_cvthf8_ph( // CHECK: call <8 x half> @llvm.x86.avx10.mask.vcvthf82ph128( - return _mm_cvthf8(__A); + return _mm_cvthf8_ph(__A); } -__m128h test_mm_mask_cvthf8(__m128h __A, __mmask8 __B, __m128i __C) { - // 
CHECK-LABEL: @test_mm_mask_cvthf8( +__m128h test_mm_mask_cvthf8_ph(__m128h __A, __mmask8 __B, __m128i __C) { + // CHECK-LABEL: @test_mm_mask_cvthf8_ph( // CHECK: call <8 x half> @llvm.x86.avx10.mask.vcvthf82ph128( - return _mm_mask_cvthf8(__A, __B, __C); + return _mm_mask_cvthf8_ph(__A, __B, __C); } -__m128h test_mm_maskz_cvthf8(__mmask8 __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_cvthf8( +__m128h test_mm_maskz_cvthf8_ph(__mmask8 __A, __m128i __B) { + // CHECK-LABEL: @test_mm_maskz_cvthf8_ph( // CHECK: call <8 x half> @llvm.x86.avx10.mask.vcvthf82ph128( - return _mm_maskz_cvthf8(__A, __B); + return _mm_maskz_cvthf8_ph(__A, __B); } -__m256h test_mm256_cvthf8(__m128i __A) { - // CHECK-LABEL: @test_mm256_cvthf8( +__m256h test_mm256_cvthf8_ph(__m128i __A) { + // CHECK-LABEL: @test_mm256_cvthf8_ph( // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvthf82ph256( - return _mm256_cvthf8(__A); + return _mm256_cvthf8_ph(__A); } -__m256h test_mm256_mask_cvthf8(__m256h __A, __mmask16 __B, __m128i __C) { - // CHECK-LABEL: @test_mm256_mask_cvthf8( +__m256h test_mm256_mask_cvthf8_ph(__m256h __A, __mmask16 __B, __m128i __C) { + // CHECK-LABEL: @test_mm256_mask_cvthf8_ph( // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvthf82ph256( - return _mm256_mask_cvthf8(__A, __B, __C); + return _mm256_mask_cvthf8_ph(__A, __B, __C); } -__m256h test_mm256_maskz_cvthf8(__mmask16 __A, __m128i __B) { - // CHECK-LABEL: @test_mm256_maskz_cvthf8( +__m256h test_mm256_maskz_cvthf8_ph(__mmask16 __A, __m128i __B) { + // CHECK-LABEL: @test_mm256_maskz_cvthf8_ph( // CHECK: call <16 x half> @llvm.x86.avx10.mask.vcvthf82ph256( - return _mm256_maskz_cvthf8(__A, __B); + return _mm256_maskz_cvthf8_ph(__A, __B); } __m128i test_mm_cvtph_bf8(__m128h __A) { From 55632404bd0b6f2b6c09426ed492e9351c9706ed Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Mon, 10 Feb 2025 00:06:25 -0500 Subject: [PATCH 117/293] [benchmark] Sync a few commits from upstream to help with CPU count (#126410) Try to use the 
_SC_NPROCESSORS_ONLN sysconf elsewhere (cherry picked from commit edb1e76d8cb080a396c7c992e5d4023e1a777bd1) Replace usage of deprecated sysctl on macOS (cherry picked from commit faaa266d33ff203e28b31dd31be9f90c29f28d04) Retrieve the number of online CPUs on OpenBSD and NetBSD (cherry picked from commit 41e81b1ca4bbb41d234f2d0f2c56591db78ebb83) Update error message now that /proc/cpuinfo is no longer in use (cherry picked from commit c35af58b61daa111c93924e0e7b65022871fadac) Fix runtime crash when parsing /proc/cpuinfo fails (cherry picked from commit 39be87d3004ff9ff4cdf736651af80c3d15e2497) another reversal of something that breaks on wasm (cherry picked from commit 44507bc91ff9a23ad8ad4120cfc6b0d9bd27e2ca) --- third-party/benchmark/src/sysinfo.cc | 53 ++++++++++++++++------------ 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/third-party/benchmark/src/sysinfo.cc b/third-party/benchmark/src/sysinfo.cc index 8283a081ee80b..3993ae17f7fc4 100644 --- a/third-party/benchmark/src/sysinfo.cc +++ b/third-party/benchmark/src/sysinfo.cc @@ -160,11 +160,11 @@ ValueUnion GetSysctlImp(std::string const& name) { int mib[2]; mib[0] = CTL_HW; - if ((name == "hw.ncpu") || (name == "hw.cpuspeed")) { + if ((name == "hw.ncpuonline") || (name == "hw.cpuspeed")) { ValueUnion buff(sizeof(int)); - if (name == "hw.ncpu") { - mib[1] = HW_NCPU; + if (name == "hw.ncpuonline") { + mib[1] = HW_NCPUONLINE; } else { mib[1] = HW_CPUSPEED; } @@ -482,27 +482,14 @@ std::string GetSystemName() { } int GetNumCPUsImpl() { -#ifdef BENCHMARK_HAS_SYSCTL - int num_cpu = -1; - if (GetSysctl("hw.ncpu", &num_cpu)) return num_cpu; - PrintErrorAndDie("Err: ", strerror(errno)); -#elif defined(BENCHMARK_OS_WINDOWS) +#ifdef BENCHMARK_OS_WINDOWS SYSTEM_INFO sysinfo; // Use memset as opposed to = {} to avoid GCC missing initializer false // positives. 
std::memset(&sysinfo, 0, sizeof(SYSTEM_INFO)); GetSystemInfo(&sysinfo); - return sysinfo.dwNumberOfProcessors; // number of logical - // processors in the current - // group -#elif defined(__linux__) || defined(BENCHMARK_OS_SOLARIS) - // Returns -1 in case of a failure. - int num_cpu = static_cast(sysconf(_SC_NPROCESSORS_ONLN)); - if (num_cpu < 0) { - PrintErrorAndDie("sysconf(_SC_NPROCESSORS_ONLN) failed with error: ", - strerror(errno)); - } - return num_cpu; + // number of logical processors in the current group + return static_cast(sysinfo.dwNumberOfProcessors); #elif defined(BENCHMARK_OS_QNX) return static_cast(_syspage_ptr->num_cpu); #elif defined(BENCHMARK_OS_QURT) @@ -511,16 +498,36 @@ int GetNumCPUsImpl() { hardware_threads.max_hthreads = 1; } return hardware_threads.max_hthreads; +#elif defined(BENCHMARK_HAS_SYSCTL) + int num_cpu = -1; + constexpr auto* hwncpu = +#if defined BENCHMARK_OS_MACOSX + "hw.logicalcpu"; +#elif defined(HW_NCPUONLINE) + "hw.ncpuonline"; +#else + "hw.ncpu"; +#endif + if (GetSysctl(hwncpu, &num_cpu)) return num_cpu; + PrintErrorAndDie("Err: ", strerror(errno)); +#elif defined(_SC_NPROCESSORS_ONLN) + // Returns -1 in case of a failure. + int num_cpu = static_cast(sysconf(_SC_NPROCESSORS_ONLN)); + if (num_cpu < 0) { + PrintErrorAndDie("sysconf(_SC_NPROCESSORS_ONLN) failed with error: ", + strerror(errno)); + } + return num_cpu; #endif BENCHMARK_UNREACHABLE(); } int GetNumCPUs() { - const int num_cpus = GetNumCPUsImpl(); + int num_cpus = GetNumCPUsImpl(); if (num_cpus < 1) { - PrintErrorAndDie( - "Unable to extract number of CPUs. If your platform uses " - "/proc/cpuinfo, custom support may need to be added."); + std::cerr << "Unable to extract number of CPUs.\n"; + /* There is at least one CPU which we run on. 
*/ + num_cpus = 1; } return num_cpus; } From 70fdd9f0a24154b63169c66aff1ddc4507db6034 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Mon, 10 Feb 2025 00:11:09 -0500 Subject: [PATCH 118/293] [GlobalISel] Check whether `G_CTLZ` is legal in `matchUMulHToLShr` (#126457) We need to check `G_CTLZ` because the combine uses `G_CTLZ` to get log base 2, and it is not always legal on a target. Fixes SWDEV-512440. --- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 6 +- ...ctlz-from-umul-to-lshr-in-postlegalizer.ll | 98 +++++++++++++++++++ ...tlz-from-umul-to-lshr-in-postlegalizer.mir | 23 +++++ 3 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/no-ctlz-from-umul-to-lshr-in-postlegalizer.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/no-ctlz-from-umul-to-lshr-in-postlegalizer.mir diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 4648414cc46ae..0dfbb91f2ac54 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -5641,6 +5641,7 @@ bool CombinerHelper::matchUMulHToLShr(MachineInstr &MI) const { Register RHS = MI.getOperand(2).getReg(); Register Dst = MI.getOperand(0).getReg(); LLT Ty = MRI.getType(Dst); + LLT RHSTy = MRI.getType(RHS); LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty); auto MatchPow2ExceptOne = [&](const Constant *C) { if (auto *CI = dyn_cast(C)) @@ -5649,7 +5650,10 @@ bool CombinerHelper::matchUMulHToLShr(MachineInstr &MI) const { }; if (!matchUnaryPredicate(MRI, RHS, MatchPow2ExceptOne, false)) return false; - return isLegalOrBeforeLegalizer({TargetOpcode::G_LSHR, {Ty, ShiftAmtTy}}); + // We need to check both G_LSHR and G_CTLZ because the combine uses G_CTLZ to + // get log base 2, and it is not always legal for on a target. 
+ return isLegalOrBeforeLegalizer({TargetOpcode::G_LSHR, {Ty, ShiftAmtTy}}) && + isLegalOrBeforeLegalizer({TargetOpcode::G_CTLZ, {RHSTy, RHSTy}}); } void CombinerHelper::applyUMulHToLShr(MachineInstr &MI) const { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/no-ctlz-from-umul-to-lshr-in-postlegalizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/no-ctlz-from-umul-to-lshr-in-postlegalizer.ll new file mode 100644 index 0000000000000..c237911319329 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/no-ctlz-from-umul-to-lshr-in-postlegalizer.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -O0 -global-isel=true %s -o - | FileCheck %s + +define void @test(ptr %p) { +; CHECK-LABEL: test: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v1, v2 +; CHECK-NEXT: s_mov_b32 s5, 16 +; CHECK-NEXT: s_mov_b32 s6, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: v_cvt_f32_ubyte0_e64 v2, v2 +; CHECK-NEXT: v_rcp_iflag_f32_e64 v2, v2 +; CHECK-NEXT: s_mov_b32 s4, 0x4f7ffffe +; CHECK-NEXT: v_mov_b32_e32 v3, s4 +; CHECK-NEXT: v_mul_f32_e64 v2, v2, v3 +; CHECK-NEXT: v_cvt_u32_f32_e64 v2, v2 +; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, s7 +; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 +; CHECK-NEXT: v_add_u32_e64 v2, v2, v3 +; CHECK-NEXT: v_mov_b32_e32 v3, s5 +; CHECK-NEXT: v_mul_hi_u32 v2, v2, v3 +; CHECK-NEXT: s_mov_b32 s7, 2 +; CHECK-NEXT: v_mov_b32_e32 v3, s7 +; CHECK-NEXT: v_add_u32_e64 v2, v2, v3 +; CHECK-NEXT: v_mov_b32_e32 v3, s6 +; CHECK-NEXT: v_cvt_f32_ubyte0_e64 v3, v3 +; CHECK-NEXT: v_rcp_iflag_f32_e64 v3, v3 +; CHECK-NEXT: v_mov_b32_e32 v4, s4 +; CHECK-NEXT: v_mul_f32_e64 v3, v3, v4 +; CHECK-NEXT: v_cvt_u32_f32_e64 v3, v3 +; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: 
v_mov_b32_e32 v4, s7 +; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4 +; CHECK-NEXT: v_add_u32_e64 v3, v3, v4 +; CHECK-NEXT: v_mov_b32_e32 v4, s5 +; CHECK-NEXT: v_mul_hi_u32 v3, v3, v4 +; CHECK-NEXT: s_mov_b32 s7, 2 +; CHECK-NEXT: v_mov_b32_e32 v4, s7 +; CHECK-NEXT: v_add_u32_e64 v6, v3, v4 +; CHECK-NEXT: v_mov_b32_e32 v3, s6 +; CHECK-NEXT: v_cvt_f32_ubyte0_e64 v3, v3 +; CHECK-NEXT: v_rcp_iflag_f32_e64 v3, v3 +; CHECK-NEXT: v_mov_b32_e32 v4, s4 +; CHECK-NEXT: v_mul_f32_e64 v3, v3, v4 +; CHECK-NEXT: v_cvt_u32_f32_e64 v3, v3 +; CHECK-NEXT: s_mov_b32 s7, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, s7 +; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4 +; CHECK-NEXT: v_add_u32_e64 v3, v3, v4 +; CHECK-NEXT: v_mov_b32_e32 v4, s5 +; CHECK-NEXT: v_mul_hi_u32 v3, v3, v4 +; CHECK-NEXT: s_mov_b32 s7, 2 +; CHECK-NEXT: v_mov_b32_e32 v4, s7 +; CHECK-NEXT: v_add_u32_e64 v3, v3, v4 +; CHECK-NEXT: v_mov_b32_e32 v4, s6 +; CHECK-NEXT: v_cvt_f32_ubyte0_e64 v4, v4 +; CHECK-NEXT: v_rcp_iflag_f32_e64 v4, v4 +; CHECK-NEXT: v_mov_b32_e32 v5, s4 +; CHECK-NEXT: v_mul_f32_e64 v4, v4, v5 +; CHECK-NEXT: v_cvt_u32_f32_e64 v4, v4 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: v_mov_b32_e32 v5, s4 +; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 +; CHECK-NEXT: v_add_u32_e64 v4, v4, v5 +; CHECK-NEXT: v_mov_b32_e32 v5, s5 +; CHECK-NEXT: v_mul_hi_u32 v4, v4, v5 +; CHECK-NEXT: s_mov_b32 s4, 2 +; CHECK-NEXT: v_mov_b32_e32 v5, s4 +; CHECK-NEXT: v_add_u32_e64 v4, v4, v5 +; CHECK-NEXT: s_mov_b32 s4, 0xff +; CHECK-NEXT: v_mov_b32_e32 v5, s4 +; CHECK-NEXT: v_mov_b32_e32 v7, s4 +; CHECK-NEXT: v_and_b32_e64 v7, v6, v7 +; CHECK-NEXT: s_mov_b32 s6, 8 +; CHECK-NEXT: v_mov_b32_e32 v6, s6 +; CHECK-NEXT: v_lshlrev_b32_e64 v6, v6, v7 +; CHECK-NEXT: v_and_or_b32 v2, v2, v5, v6 +; CHECK-NEXT: v_mov_b32_e32 v5, s4 +; CHECK-NEXT: v_and_b32_e64 v5, v3, v5 +; CHECK-NEXT: v_mov_b32_e32 v3, s5 +; CHECK-NEXT: v_lshlrev_b32_e64 v3, v3, v5 +; CHECK-NEXT: v_mov_b32_e32 v5, s4 +; CHECK-NEXT: v_and_b32_e64 v5, v4, v5 +; CHECK-NEXT: s_mov_b32 s4, 24 +; CHECK-NEXT: 
v_mov_b32_e32 v4, s4 +; CHECK-NEXT: v_lshlrev_b32_e64 v4, v4, v5 +; CHECK-NEXT: v_or3_b32 v2, v2, v3, v4 +; CHECK-NEXT: flat_store_dword v[0:1], v2 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %B = udiv <4 x i8> splat (i8 16), zeroinitializer + store <4 x i8> %B, ptr %p, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/no-ctlz-from-umul-to-lshr-in-postlegalizer.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/no-ctlz-from-umul-to-lshr-in-postlegalizer.mir new file mode 100644 index 0000000000000..00ead74cb37bb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/no-ctlz-from-umul-to-lshr-in-postlegalizer.mir @@ -0,0 +1,23 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=amdgpu-postlegalizer-combiner %s -o - | FileCheck %s + +--- +name: test +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: test + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[COPY]], [[C]] + ; CHECK-NEXT: $vgpr0 = COPY [[UMULH]](s32) + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = G_CONSTANT i32 4 + %2:_(s32) = G_UMULH %0:_, %1:_ + $vgpr0 = COPY %2:_(s32) + SI_RETURN implicit $vgpr0 From aebe6c5d7f88a05a29ef6c643482ca7eaf994b19 Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Sun, 9 Feb 2025 21:35:32 -0800 Subject: [PATCH 119/293] [RISCV] Improve Errors for X1/X5/X1X5 Reg Classes (#126184) LLVM has functionality for producing a register-class-specific error message in the assembly parser, rather than just emitting the generic "invalid operand for instruction" error. 
This starts the gradual adoption of this functionality for RISC-V, with some lesser-used shadow-stack register classes: - GPRX1 (only contains `ra`) - GPRX5 (only contains `t0`) - GPRX1X5 (only contains `ra` and `t0`) LLVM is reasonably conservative about when these errors are used, in particular you have to have all the features for the relevant mnemonic enabled before it will do, hence the test updates. This also merges a pair of almost identical rv32/rv64 test files into a single file with one run line. --- llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 17 ++++++++++++++--- llvm/test/MC/RISCV/rv32zicfiss-invalid.s | 17 ----------------- llvm/test/MC/RISCV/rv64zicfiss-invalid.s | 17 ----------------- llvm/test/MC/RISCV/zicfiss-invalid.s | 19 +++++++++++++++++++ 4 files changed, 33 insertions(+), 37 deletions(-) delete mode 100644 llvm/test/MC/RISCV/rv32zicfiss-invalid.s delete mode 100644 llvm/test/MC/RISCV/rv64zicfiss-invalid.s create mode 100644 llvm/test/MC/RISCV/zicfiss-invalid.s diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 7eb93973459c0..e7e7a4b7d035b 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -247,8 +247,16 @@ def GPR : GPRRegisterClass<(add (sequence "X%u", 10, 17), (sequence "X%u", 0, 4))>; def GPRX0 : GPRRegisterClass<(add X0)>; -def GPRX1 : GPRRegisterClass<(add X1)>; -def GPRX5 : GPRRegisterClass<(add X5)>; + +def GPRX1 : GPRRegisterClass<(add X1)> { + let DiagnosticType = "InvalidRegClassGPRX1"; + let DiagnosticString = "register must be ra (x1)"; +} + +def GPRX5 : GPRRegisterClass<(add X5)> { + let DiagnosticType = "InvalidRegClassGPRX5"; + let DiagnosticString = "register must be t0 (x5)"; +} def GPRNoX0 : GPRRegisterClass<(sub GPR, X0)>; @@ -282,7 +290,10 @@ def SP : GPRRegisterClass<(add X2)>; def SR07 : GPRRegisterClass<(add (sequence "X%u", 8, 9), (sequence "X%u", 18, 23))>; -def GPRX1X5 : GPRRegisterClass<(add X1, X5)>; 
+def GPRX1X5 : GPRRegisterClass<(add X1, X5)> { + let DiagnosticType = "InvalidRegClassGPRX1X5"; + let DiagnosticString = "register must be ra or t0 (x1 or x5)"; +} //===----------------------------------------------------------------------===// // Even-Odd GPR Pairs diff --git a/llvm/test/MC/RISCV/rv32zicfiss-invalid.s b/llvm/test/MC/RISCV/rv32zicfiss-invalid.s deleted file mode 100644 index 048df67e8a646..0000000000000 --- a/llvm/test/MC/RISCV/rv32zicfiss-invalid.s +++ /dev/null @@ -1,17 +0,0 @@ -# RUN: not llvm-mc %s -triple=riscv32 -mattr=+experimental-zicfiss,+c -M no-aliases -show-encoding \ -# RUN: 2>&1 | FileCheck -check-prefixes=CHECK-ERR %s - -# CHECK-ERR: error: invalid operand for instruction -sspopchk a1 - -# CHECK-ERR: error: invalid operand for instruction -c.sspush t0 - -# CHECK-ERR: error: invalid operand for instruction -c.sspopchk ra - -# CHECK-ERR: error: invalid operand for instruction -sspush a0 - -# CHECK-ERR: error: invalid operand for instruction -ssrdp zero diff --git a/llvm/test/MC/RISCV/rv64zicfiss-invalid.s b/llvm/test/MC/RISCV/rv64zicfiss-invalid.s deleted file mode 100644 index fc69c68a477d6..0000000000000 --- a/llvm/test/MC/RISCV/rv64zicfiss-invalid.s +++ /dev/null @@ -1,17 +0,0 @@ -# RUN: not llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfiss,+c -M no-aliases -show-encoding \ -# RUN: 2>&1 | FileCheck -check-prefixes=CHECK-ERR %s - -# CHECK-ERR: error: invalid operand for instruction -sspopchk a1 - -# CHECK-ERR: error: invalid operand for instruction -c.sspush t0 - -# CHECK-ERR: error: invalid operand for instruction -c.sspopchk ra - -# CHECK-ERR: error: invalid operand for instruction -sspush a0 - -# CHECK-ERR: error: invalid operand for instruction -ssrdp zero diff --git a/llvm/test/MC/RISCV/zicfiss-invalid.s b/llvm/test/MC/RISCV/zicfiss-invalid.s new file mode 100644 index 0000000000000..a5ab9240f3fad --- /dev/null +++ b/llvm/test/MC/RISCV/zicfiss-invalid.s @@ -0,0 +1,19 @@ +# RUN: not llvm-mc %s -triple=riscv32 
-mattr=+experimental-zicfiss,+zcmop,+c -M no-aliases -show-encoding \ +# RUN: 2>&1 | FileCheck -check-prefixes=CHECK-ERR %s +# RUN: not llvm-mc %s -triple=riscv64 -mattr=+experimental-zicfiss,+zcmop,+c -M no-aliases -show-encoding \ +# RUN: 2>&1 | FileCheck -check-prefixes=CHECK-ERR %s + +# CHECK-ERR: error: register must be ra or t0 (x1 or x5) +sspopchk a1 + +# CHECK-ERR: error: register must be ra (x1) +c.sspush t0 + +# CHECK-ERR: error: register must be t0 (x5) +c.sspopchk ra + +# CHECK-ERR: error: register must be ra or t0 (x1 or x5) +sspush a0 + +# CHECK-ERR: error: invalid operand for instruction +ssrdp zero From 3a66ebae06d72d500c52413b9b189e95762e01b3 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Mon, 10 Feb 2025 08:55:27 +0100 Subject: [PATCH 120/293] [BoundsSafety][doc] Fix a typo (#126247) --- clang/docs/BoundsSafety.rst | 4 ++-- clang/docs/BoundsSafetyImplPlans.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/docs/BoundsSafety.rst b/clang/docs/BoundsSafety.rst index 8635bec6e17c7..cf5b0c75c0387 100644 --- a/clang/docs/BoundsSafety.rst +++ b/clang/docs/BoundsSafety.rst @@ -777,13 +777,13 @@ the transformed pseudo code of function ``alloc_buf()`` in the example below. size_t count; } sized_buf_t; - void alloc_buf(sized_buf_t *sbuf, sized_t nelems) { + void alloc_buf(sized_buf_t *sbuf, size_t nelems) { sbuf->buf = (int *)malloc(sizeof(int) * nelems); sbuf->count = nelems; } // Transformed pseudo code: - void alloc_buf(sized_buf_t *sbuf, sized_t nelems) { + void alloc_buf(sized_buf_t *sbuf, size_t nelems) { // Materialize RHS values: int *tmp_ptr = (int *)malloc(sizeof(int) * nelems); int tmp_count = nelems; diff --git a/clang/docs/BoundsSafetyImplPlans.rst b/clang/docs/BoundsSafetyImplPlans.rst index 93c2ed7b43402..34276c920f31e 100644 --- a/clang/docs/BoundsSafetyImplPlans.rst +++ b/clang/docs/BoundsSafetyImplPlans.rst @@ -134,7 +134,7 @@ same basic block and without side effect in between. 
int *__counted_by(count) buf; size_t count; } sized_buf_t; - void alloc_buf(sized_buf_t *sbuf, sized_t nelems) { + void alloc_buf(sized_buf_t *sbuf, size_t nelems) { sbuf->buf = (int *)malloc(sizeof(int) * nelems); sbuf->count = nelems; } From 30e7c101465d5fa4e9266b9ae3b238eb8cf4533b Mon Sep 17 00:00:00 2001 From: David Stuttard Date: Mon, 10 Feb 2025 07:58:02 +0000 Subject: [PATCH 121/293] [AMDGPU] - Fix non-deterministic compile issue (#126271) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 4ce1f9079d4d3 [AMDGPU] Allow rematerialization of instructions with virtual register uses (#124327) made changes that require an ordered traversal of a DenseMap. Changing it to MapVector which respects insertion order. --- llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 7d3e63df43da6..e3da8d3005629 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -442,7 +442,7 @@ class PreRARematStage : public GCNSchedStage { // Map a trivially rematerializable def to a list of regions at MinOccupancy // that has the defined reg as a live-in. - DenseMap> RematDefToLiveInRegions; + MapVector> RematDefToLiveInRegions; // Collect all trivially rematerializable VGPR instructions with a single def // and single use outside the defining block into RematerializableInsts. From 67b7a2590f39ad9ff5413adb9af162220972833e Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Mon, 10 Feb 2025 09:09:58 +0100 Subject: [PATCH 122/293] Revert "[mlir] Python: Parse ModuleOp from file path" (#126482) Reverts llvm/llvm-project#125736 The gcc7 Bot is broken at the moment. 
--- mlir/include/mlir-c/IR.h | 4 ---- mlir/include/mlir/Bindings/Python/Nanobind.h | 1 - mlir/lib/Bindings/Python/IRCore.cpp | 16 +--------------- mlir/lib/CAPI/IR/IR.cpp | 10 ---------- mlir/python/mlir/_mlir_libs/_mlir/ir.pyi | 3 +-- mlir/test/python/ir/module.py | 20 -------------------- 6 files changed, 2 insertions(+), 52 deletions(-) diff --git a/mlir/include/mlir-c/IR.h b/mlir/include/mlir-c/IR.h index 14ccae650606a..7d2fd89e8560f 100644 --- a/mlir/include/mlir-c/IR.h +++ b/mlir/include/mlir-c/IR.h @@ -309,10 +309,6 @@ MLIR_CAPI_EXPORTED MlirModule mlirModuleCreateEmpty(MlirLocation location); MLIR_CAPI_EXPORTED MlirModule mlirModuleCreateParse(MlirContext context, MlirStringRef module); -/// Parses a module from file and transfers ownership to the caller. -MLIR_CAPI_EXPORTED MlirModule -mlirModuleCreateParseFromFile(MlirContext context, MlirStringRef fileName); - /// Gets the context that a module was created with. MLIR_CAPI_EXPORTED MlirContext mlirModuleGetContext(MlirModule module); diff --git a/mlir/include/mlir/Bindings/Python/Nanobind.h b/mlir/include/mlir/Bindings/Python/Nanobind.h index bc8bddf4caf7e..ca942c83d3e2f 100644 --- a/mlir/include/mlir/Bindings/Python/Nanobind.h +++ b/mlir/include/mlir/Bindings/Python/Nanobind.h @@ -23,7 +23,6 @@ #endif #include #include -#include #include #include #include diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 2e4b6d1ce35c1..47a85c2a486fd 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -#include #include #include @@ -300,7 +299,7 @@ struct PyAttrBuilderMap { return *builder; } static void dunderSetItemNamed(const std::string &attributeKind, - nb::callable func, bool replace) { + nb::callable func, bool replace) { PyGlobals::get().registerAttributeBuilder(attributeKind, std::move(func), replace); } @@ -3050,19 +3049,6 @@ void 
mlir::python::populateIRCore(nb::module_ &m) { }, nb::arg("asm"), nb::arg("context").none() = nb::none(), kModuleParseDocstring) - .def_static( - "parse", - [](const std::filesystem::path &path, - DefaultingPyMlirContext context) { - PyMlirContext::ErrorCapture errors(context->getRef()); - MlirModule module = mlirModuleCreateParseFromFile( - context->get(), toMlirStringRef(path.string())); - if (mlirModuleIsNull(module)) - throw MLIRError("Unable to parse module assembly", errors.take()); - return PyModule::forModule(module).releaseObject(); - }, - nb::arg("asm"), nb::arg("context").none() = nb::none(), - kModuleParseDocstring) .def_static( "create", [](DefaultingPyLocation loc) { diff --git a/mlir/lib/CAPI/IR/IR.cpp b/mlir/lib/CAPI/IR/IR.cpp index 999e8cbda1295..f27af0ca9a2c7 100644 --- a/mlir/lib/CAPI/IR/IR.cpp +++ b/mlir/lib/CAPI/IR/IR.cpp @@ -22,7 +22,6 @@ #include "mlir/IR/Location.h" #include "mlir/IR/Operation.h" #include "mlir/IR/OperationSupport.h" -#include "mlir/IR/OwningOpRef.h" #include "mlir/IR/Types.h" #include "mlir/IR/Value.h" #include "mlir/IR/Verifier.h" @@ -329,15 +328,6 @@ MlirModule mlirModuleCreateParse(MlirContext context, MlirStringRef module) { return MlirModule{owning.release().getOperation()}; } -MlirModule mlirModuleCreateParseFromFile(MlirContext context, - MlirStringRef fileName) { - OwningOpRef owning = - parseSourceFile(unwrap(fileName), unwrap(context)); - if (!owning) - return MlirModule{nullptr}; - return MlirModule{owning.release().getOperation()}; -} - MlirContext mlirModuleGetContext(MlirModule module) { return wrap(unwrap(module).getContext()); } diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi index 096b87b362443..fb7efb8cd28a5 100644 --- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi @@ -46,7 +46,6 @@ import abc import collections from collections.abc import Callable, Sequence import io -from pathlib import Path from typing import 
Any, ClassVar, TypeVar, overload __all__ = [ @@ -2124,7 +2123,7 @@ class Module: Creates an empty module """ @staticmethod - def parse(asm: str | bytes | Path, context: Context | None = None) -> Module: + def parse(asm: str | bytes, context: Context | None = None) -> Module: """ Parses a module's assembly format from a string. diff --git a/mlir/test/python/ir/module.py b/mlir/test/python/ir/module.py index 441916b38ee73..ecafcb46af217 100644 --- a/mlir/test/python/ir/module.py +++ b/mlir/test/python/ir/module.py @@ -1,8 +1,6 @@ # RUN: %PYTHON %s | FileCheck %s import gc -from pathlib import Path -from tempfile import NamedTemporaryFile from mlir.ir import * @@ -29,24 +27,6 @@ def testParseSuccess(): print(str(module)) -# Verify successful parse from file. -# CHECK-LABEL: TEST: testParseFromFileSuccess -# CHECK: module @successfulParse -@run -def testParseFromFileSuccess(): - ctx = Context() - with NamedTemporaryFile(mode="w") as tmp_file: - tmp_file.write(r"""module @successfulParse {}""") - tmp_file.flush() - module = Module.parse(Path(tmp_file.name), ctx) - assert module.context is ctx - print("CLEAR CONTEXT") - ctx = None # Ensure that module captures the context. - gc.collect() - module.operation.verify() - print(str(module)) - - # Verify parse error. # CHECK-LABEL: TEST: testParseError # CHECK: testParseError: < From 5f84b6edd97153f1e5ec00ce110108ba8f6048bd Mon Sep 17 00:00:00 2001 From: Ricardo Jesus Date: Mon, 10 Feb 2025 08:23:34 +0000 Subject: [PATCH 123/293] [AArch64] Add MATCH loops to LoopIdiomVectorizePass (#101976) This patch adds a new loop to LoopIdiomVectorizePass, enabling it to recognise and vectorise loops such as: ```cpp template InputIt find_first_of(InputIt first, InputIt last, ForwardIt s_first, ForwardIt s_last) { for (; first != last; ++first) for (ForwardIt it = s_first; it != s_last; ++it) if (*first == *it) return first; return last; } ``` These loops match the C++ standard library function `std::find_first_of`. 
--- .../Vectorize/LoopIdiomVectorize.cpp | 496 ++++++++++++- .../LoopIdiom/AArch64/find-first-byte.ll | 671 ++++++++++++++++++ 2 files changed, 1158 insertions(+), 9 deletions(-) create mode 100644 llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp index 7af7408ed67a8..90329200dd7e4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp @@ -10,8 +10,10 @@ // transforms them into more optimized versions of the same loop. In cases // where this happens, it can be a significant performance win. // -// We currently only recognize one loop that finds the first mismatched byte -// in an array and returns the index, i.e. something like: +// We currently support two loops: +// +// 1. A loop that finds the first mismatched byte in an array and returns the +// index, i.e. something like: // // while (++i != n) { // if (a[i] != b[i]) @@ -24,12 +26,6 @@ // boundaries. However, even with these checks it is still profitable to do the // transformation. // -//===----------------------------------------------------------------------===// -// -// NOTE: This Pass matches a really specific loop pattern because it's only -// supposed to be a temporary solution until our LoopVectorizer is powerful -// enought to vectorize it automatically. -// // TODO List: // // * Add support for the inverse case where we scan for a matching element. @@ -37,6 +33,35 @@ // * Recognize loops that increment the IV *after* comparing bytes. // * Allow 32-bit sign-extends of the IV used by the GEP. // +// 2. 
A loop that finds the first matching character in an array among a set of +// possible matches, e.g.: +// +// for (; first != last; ++first) +// for (s_it = s_first; s_it != s_last; ++s_it) +// if (*first == *s_it) +// return first; +// return last; +// +// This corresponds to std::find_first_of (for arrays of bytes) from the C++ +// standard library. This function can be implemented efficiently for targets +// that support @llvm.experimental.vector.match. For example, on AArch64 targets +// that implement SVE2, this lower to a MATCH instruction, which enables us to +// perform up to 16x16=256 comparisons in one go. This can lead to very +// significant speedups. +// +// TODO: +// +// * Add support for `find_first_not_of' loops (i.e. with not-equal comparison). +// * Make VF a configurable parameter (right now we assume 128-bit vectors). +// * Potentially adjust the cost model to let the transformation kick-in even if +// @llvm.experimental.vector.match doesn't have direct support in hardware. +// +//===----------------------------------------------------------------------===// +// +// NOTE: This Pass matches really specific loop patterns because it's only +// supposed to be a temporary solution until our LoopVectorizer is powerful +// enough to vectorize them automatically. 
+// //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" @@ -79,6 +104,11 @@ static cl::opt cl::desc("The vectorization factor for byte-compare patterns."), cl::init(16)); +static cl::opt + DisableFindFirstByte("disable-loop-idiom-vectorize-find-first-byte", + cl::Hidden, cl::init(false), + cl::desc("Do not convert find-first-byte loop(s).")); + static cl::opt VerifyLoops("loop-idiom-vectorize-verify", cl::Hidden, cl::init(false), cl::desc("Verify loops generated Loop Idiom Vectorize Pass.")); @@ -136,6 +166,19 @@ class LoopIdiomVectorize { PHINode *IndPhi, Value *MaxLen, Instruction *Index, Value *Start, bool IncIdx, BasicBlock *FoundBB, BasicBlock *EndBB); + + bool recognizeFindFirstByte(); + + Value *expandFindFirstByte(IRBuilder<> &Builder, DomTreeUpdater &DTU, + unsigned VF, Type *CharTy, BasicBlock *ExitSucc, + BasicBlock *ExitFail, Value *SearchStart, + Value *SearchEnd, Value *NeedleStart, + Value *NeedleEnd); + + void transformFindFirstByte(PHINode *IndPhi, unsigned VF, Type *CharTy, + BasicBlock *ExitSucc, BasicBlock *ExitFail, + Value *SearchStart, Value *SearchEnd, + Value *NeedleStart, Value *NeedleEnd); /// @} }; } // anonymous namespace @@ -190,7 +233,13 @@ bool LoopIdiomVectorize::run(Loop *L) { LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << F.getName() << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); - return recognizeByteCompare(); + if (recognizeByteCompare()) + return true; + + if (recognizeFindFirstByte()) + return true; + + return false; } bool LoopIdiomVectorize::recognizeByteCompare() { @@ -939,3 +988,432 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA, report_fatal_error("Loops must remain in LCSSA form!"); } } + +bool LoopIdiomVectorize::recognizeFindFirstByte() { + // Currently the transformation only works on scalable vector types, although + // there is no fundamental reason why it cannot be made to work for fixed 
+ // vectors. We also need to know the target's minimum page size in order to + // generate runtime memory checks to ensure the vector version won't fault. + if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() || + DisableFindFirstByte) + return false; + + // Define some constants we need throughout. + BasicBlock *Header = CurLoop->getHeader(); + LLVMContext &Ctx = Header->getContext(); + + // We are expecting the four blocks defined below: Header, MatchBB, InnerBB, + // and OuterBB. For now, we will bail our for almost anything else. The Four + // blocks contain one nested loop. + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 4 || + CurLoop->getSubLoops().size() != 1) + return false; + + auto *InnerLoop = CurLoop->getSubLoops().front(); + PHINode *IndPhi = dyn_cast(&Header->front()); + if (!IndPhi || IndPhi->getNumIncomingValues() != 2) + return false; + + // Check instruction counts. + auto LoopBlocks = CurLoop->getBlocks(); + if (LoopBlocks[0]->sizeWithoutDebug() > 3 || + LoopBlocks[1]->sizeWithoutDebug() > 4 || + LoopBlocks[2]->sizeWithoutDebug() > 3 || + LoopBlocks[3]->sizeWithoutDebug() > 3) + return false; + + // Check that no instruction other than IndPhi has outside uses. + for (BasicBlock *BB : LoopBlocks) + for (Instruction &I : *BB) + if (&I != IndPhi) + for (User *U : I.users()) + if (!CurLoop->contains(cast(U))) + return false; + + // Match the branch instruction in the header. We are expecting an + // unconditional branch to the inner loop. + // + // Header: + // %14 = phi ptr [ %24, %OuterBB ], [ %3, %Header.preheader ] + // %15 = load i8, ptr %14, align 1 + // br label %MatchBB + BasicBlock *MatchBB; + if (!match(Header->getTerminator(), m_UnconditionalBr(MatchBB)) || + !InnerLoop->contains(MatchBB)) + return false; + + // MatchBB should be the entrypoint into the inner loop containing the + // comparison between a search element and a needle. 
+ // + // MatchBB: + // %20 = phi ptr [ %7, %Header ], [ %17, %InnerBB ] + // %21 = load i8, ptr %20, align 1 + // %22 = icmp eq i8 %15, %21 + // br i1 %22, label %ExitSucc, label %InnerBB + BasicBlock *ExitSucc, *InnerBB; + Value *LoadSearch, *LoadNeedle; + CmpPredicate MatchPred; + if (!match(MatchBB->getTerminator(), + m_Br(m_ICmp(MatchPred, m_Value(LoadSearch), m_Value(LoadNeedle)), + m_BasicBlock(ExitSucc), m_BasicBlock(InnerBB))) || + MatchPred != ICmpInst::ICMP_EQ || !InnerLoop->contains(InnerBB)) + return false; + + // We expect outside uses of `IndPhi' in ExitSucc (and only there). + for (User *U : IndPhi->users()) + if (!CurLoop->contains(cast(U))) { + auto *PN = dyn_cast(U); + if (!PN || PN->getParent() != ExitSucc) + return false; + } + + // Match the loads and check they are simple. + Value *Search, *Needle; + if (!match(LoadSearch, m_Load(m_Value(Search))) || + !match(LoadNeedle, m_Load(m_Value(Needle))) || + !cast(LoadSearch)->isSimple() || + !cast(LoadNeedle)->isSimple()) + return false; + + // Check we are loading valid characters. + Type *CharTy = LoadSearch->getType(); + if (!CharTy->isIntegerTy() || LoadNeedle->getType() != CharTy) + return false; + + // Pick the vectorisation factor based on CharTy, work out the cost of the + // match intrinsic and decide if we should use it. + // Note: For the time being we assume 128-bit vectors. + unsigned VF = 128 / CharTy->getIntegerBitWidth(); + SmallVector Args = { + ScalableVectorType::get(CharTy, VF), FixedVectorType::get(CharTy, VF), + ScalableVectorType::get(Type::getInt1Ty(Ctx), VF)}; + IntrinsicCostAttributes Attrs(Intrinsic::experimental_vector_match, Args[2], + Args); + if (TTI->getIntrinsicInstrCost(Attrs, TTI::TCK_SizeAndLatency) > 4) + return false; + + // The loads come from two PHIs, each with two incoming values. 
+ PHINode *PSearch = dyn_cast(Search); + PHINode *PNeedle = dyn_cast(Needle); + if (!PSearch || PSearch->getNumIncomingValues() != 2 || !PNeedle || + PNeedle->getNumIncomingValues() != 2) + return false; + + // One PHI comes from the outer loop (PSearch), the other one from the inner + // loop (PNeedle). PSearch effectively corresponds to IndPhi. + if (InnerLoop->contains(PSearch)) + std::swap(PSearch, PNeedle); + if (PSearch != &Header->front() || PNeedle != &MatchBB->front()) + return false; + + // The incoming values of both PHI nodes should be a gep of 1. + Value *SearchStart = PSearch->getIncomingValue(0); + Value *SearchIndex = PSearch->getIncomingValue(1); + if (CurLoop->contains(PSearch->getIncomingBlock(0))) + std::swap(SearchStart, SearchIndex); + + Value *NeedleStart = PNeedle->getIncomingValue(0); + Value *NeedleIndex = PNeedle->getIncomingValue(1); + if (InnerLoop->contains(PNeedle->getIncomingBlock(0))) + std::swap(NeedleStart, NeedleIndex); + + // Match the GEPs. + if (!match(SearchIndex, m_GEP(m_Specific(PSearch), m_One())) || + !match(NeedleIndex, m_GEP(m_Specific(PNeedle), m_One()))) + return false; + + // Check the GEPs result type matches `CharTy'. + GetElementPtrInst *GEPSearch = cast(SearchIndex); + GetElementPtrInst *GEPNeedle = cast(NeedleIndex); + if (GEPSearch->getResultElementType() != CharTy || + GEPNeedle->getResultElementType() != CharTy) + return false; + + // InnerBB should increment the address of the needle pointer. + // + // InnerBB: + // %17 = getelementptr inbounds i8, ptr %20, i64 1 + // %18 = icmp eq ptr %17, %10 + // br i1 %18, label %OuterBB, label %MatchBB + BasicBlock *OuterBB; + Value *NeedleEnd; + if (!match(InnerBB->getTerminator(), + m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(GEPNeedle), + m_Value(NeedleEnd)), + m_BasicBlock(OuterBB), m_Specific(MatchBB))) || + !CurLoop->contains(OuterBB)) + return false; + + // OuterBB should increment the address of the search element pointer. 
+ // + // OuterBB: + // %24 = getelementptr inbounds i8, ptr %14, i64 1 + // %25 = icmp eq ptr %24, %6 + // br i1 %25, label %ExitFail, label %Header + BasicBlock *ExitFail; + Value *SearchEnd; + if (!match(OuterBB->getTerminator(), + m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(GEPSearch), + m_Value(SearchEnd)), + m_BasicBlock(ExitFail), m_Specific(Header)))) + return false; + + if (!CurLoop->isLoopInvariant(SearchStart) || + !CurLoop->isLoopInvariant(SearchEnd) || + !CurLoop->isLoopInvariant(NeedleStart) || + !CurLoop->isLoopInvariant(NeedleEnd)) + return false; + + LLVM_DEBUG(dbgs() << "Found idiom in loop: \n" << *CurLoop << "\n\n"); + + transformFindFirstByte(IndPhi, VF, CharTy, ExitSucc, ExitFail, SearchStart, + SearchEnd, NeedleStart, NeedleEnd); + return true; +} + +Value *LoopIdiomVectorize::expandFindFirstByte( + IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, Type *CharTy, + BasicBlock *ExitSucc, BasicBlock *ExitFail, Value *SearchStart, + Value *SearchEnd, Value *NeedleStart, Value *NeedleEnd) { + // Set up some types and constants that we intend to reuse. + auto *PtrTy = Builder.getPtrTy(); + auto *I64Ty = Builder.getInt64Ty(); + auto *PredVTy = ScalableVectorType::get(Builder.getInt1Ty(), VF); + auto *CharVTy = ScalableVectorType::get(CharTy, VF); + auto *ConstVF = ConstantInt::get(I64Ty, VF); + + // Other common arguments. + BasicBlock *Preheader = CurLoop->getLoopPreheader(); + LLVMContext &Ctx = Preheader->getContext(); + Value *Passthru = ConstantInt::getNullValue(CharVTy); + + // Split block in the original loop preheader. + // SPH is the new preheader to the old scalar loop. + BasicBlock *SPH = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, + nullptr, "scalar_preheader"); + + // Create the blocks that we're going to use. + // + // We will have the following loops: + // (O) Outer loop where we iterate over the elements of the search array. + // (I) Inner loop where we iterate over the elements of the needle array. 
+ // + // Overall, the blocks do the following: + // (0) Check if the arrays can't cross page boundaries. If so go to (1), + // otherwise fall back to the original scalar loop. + // (1) Load the search array. Go to (2). + // (2) (a) Load the needle array. + // (b) Splat the first element to the inactive lanes. + // (c) Check if any elements match. If so go to (3), otherwise go to (4). + // (3) Compute the index of the first match and exit. + // (4) Check if we've reached the end of the needle array. If not loop back to + // (2), otherwise go to (5). + // (5) Check if we've reached the end of the search array. If not loop back to + // (1), otherwise exit. + // Blocks (0,3) are not part of any loop. Blocks (1,5) and (2,4) belong to + // the outer and inner loops, respectively. + BasicBlock *BB0 = BasicBlock::Create(Ctx, "mem_check", SPH->getParent(), SPH); + BasicBlock *BB1 = + BasicBlock::Create(Ctx, "find_first_vec_header", SPH->getParent(), SPH); + BasicBlock *BB2 = + BasicBlock::Create(Ctx, "match_check_vec", SPH->getParent(), SPH); + BasicBlock *BB3 = + BasicBlock::Create(Ctx, "calculate_match", SPH->getParent(), SPH); + BasicBlock *BB4 = + BasicBlock::Create(Ctx, "needle_check_vec", SPH->getParent(), SPH); + BasicBlock *BB5 = + BasicBlock::Create(Ctx, "search_check_vec", SPH->getParent(), SPH); + + // Update LoopInfo with the new loops. + auto OuterLoop = LI->AllocateLoop(); + auto InnerLoop = LI->AllocateLoop(); + + if (auto ParentLoop = CurLoop->getParentLoop()) { + ParentLoop->addBasicBlockToLoop(BB0, *LI); + ParentLoop->addChildLoop(OuterLoop); + ParentLoop->addBasicBlockToLoop(BB3, *LI); + } else { + LI->addTopLevelLoop(OuterLoop); + } + + // Add the inner loop to the outer. + OuterLoop->addChildLoop(InnerLoop); + + // Add the new basic blocks to the corresponding loops. 
+ OuterLoop->addBasicBlockToLoop(BB1, *LI); + OuterLoop->addBasicBlockToLoop(BB5, *LI); + InnerLoop->addBasicBlockToLoop(BB2, *LI); + InnerLoop->addBasicBlockToLoop(BB4, *LI); + + // Update the terminator added by SplitBlock to branch to the first block. + Preheader->getTerminator()->setSuccessor(0, BB0); + DTU.applyUpdates({{DominatorTree::Delete, Preheader, SPH}, + {DominatorTree::Insert, Preheader, BB0}}); + + // (0) Check if we could be crossing a page boundary; if so, fallback to the + // old scalar loops. Also create a predicate of VF elements to be used in the + // vector loops. + Builder.SetInsertPoint(BB0); + Value *ISearchStart = + Builder.CreatePtrToInt(SearchStart, I64Ty, "search_start_int"); + Value *ISearchEnd = + Builder.CreatePtrToInt(SearchEnd, I64Ty, "search_end_int"); + Value *INeedleStart = + Builder.CreatePtrToInt(NeedleStart, I64Ty, "needle_start_int"); + Value *INeedleEnd = + Builder.CreatePtrToInt(NeedleEnd, I64Ty, "needle_end_int"); + Value *PredVF = + Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty}, + {ConstantInt::get(I64Ty, 0), ConstVF}); + + const uint64_t MinPageSize = TTI->getMinPageSize().value(); + const uint64_t AddrShiftAmt = llvm::Log2_64(MinPageSize); + Value *SearchStartPage = + Builder.CreateLShr(ISearchStart, AddrShiftAmt, "search_start_page"); + Value *SearchEndPage = + Builder.CreateLShr(ISearchEnd, AddrShiftAmt, "search_end_page"); + Value *NeedleStartPage = + Builder.CreateLShr(INeedleStart, AddrShiftAmt, "needle_start_page"); + Value *NeedleEndPage = + Builder.CreateLShr(INeedleEnd, AddrShiftAmt, "needle_end_page"); + Value *SearchPageCmp = + Builder.CreateICmpNE(SearchStartPage, SearchEndPage, "search_page_cmp"); + Value *NeedlePageCmp = + Builder.CreateICmpNE(NeedleStartPage, NeedleEndPage, "needle_page_cmp"); + + Value *CombinedPageCmp = + Builder.CreateOr(SearchPageCmp, NeedlePageCmp, "combined_page_cmp"); + BranchInst *CombinedPageBr = Builder.CreateCondBr(CombinedPageCmp, SPH, BB1); + 
CombinedPageBr->setMetadata(LLVMContext::MD_prof, + MDBuilder(Ctx).createBranchWeights(10, 90)); + DTU.applyUpdates( + {{DominatorTree::Insert, BB0, SPH}, {DominatorTree::Insert, BB0, BB1}}); + + // (1) Load the search array and branch to the inner loop. + Builder.SetInsertPoint(BB1); + PHINode *Search = Builder.CreatePHI(PtrTy, 2, "psearch"); + Value *PredSearch = Builder.CreateIntrinsic( + Intrinsic::get_active_lane_mask, {PredVTy, I64Ty}, + {Builder.CreatePtrToInt(Search, I64Ty), ISearchEnd}, nullptr, + "search_pred"); + PredSearch = Builder.CreateAnd(PredVF, PredSearch, "search_masked"); + Value *LoadSearch = Builder.CreateMaskedLoad( + CharVTy, Search, Align(1), PredSearch, Passthru, "search_load_vec"); + Builder.CreateBr(BB2); + DTU.applyUpdates({{DominatorTree::Insert, BB1, BB2}}); + + // (2) Inner loop. + Builder.SetInsertPoint(BB2); + PHINode *Needle = Builder.CreatePHI(PtrTy, 2, "pneedle"); + + // (2.a) Load the needle array. + Value *PredNeedle = Builder.CreateIntrinsic( + Intrinsic::get_active_lane_mask, {PredVTy, I64Ty}, + {Builder.CreatePtrToInt(Needle, I64Ty), INeedleEnd}, nullptr, + "needle_pred"); + PredNeedle = Builder.CreateAnd(PredVF, PredNeedle, "needle_masked"); + Value *LoadNeedle = Builder.CreateMaskedLoad( + CharVTy, Needle, Align(1), PredNeedle, Passthru, "needle_load_vec"); + + // (2.b) Splat the first element to the inactive lanes. + Value *Needle0 = + Builder.CreateExtractElement(LoadNeedle, uint64_t(0), "needle0"); + Value *Needle0Splat = Builder.CreateVectorSplat(ElementCount::getScalable(VF), + Needle0, "needle0"); + LoadNeedle = Builder.CreateSelect(PredNeedle, LoadNeedle, Needle0Splat, + "needle_splat"); + LoadNeedle = + Builder.CreateExtractVector(FixedVectorType::get(CharTy, VF), LoadNeedle, + ConstantInt::get(I64Ty, 0), "needle_vec"); + + // (2.c) Test if there's a match. 
+ Value *MatchPred = Builder.CreateIntrinsic( + Intrinsic::experimental_vector_match, {CharVTy, LoadNeedle->getType()}, + {LoadSearch, LoadNeedle, PredSearch}, nullptr, "match_pred"); + Value *IfAnyMatch = Builder.CreateOrReduce(MatchPred); + Builder.CreateCondBr(IfAnyMatch, BB3, BB4); + DTU.applyUpdates( + {{DominatorTree::Insert, BB2, BB3}, {DominatorTree::Insert, BB2, BB4}}); + + // (3) We found a match. Compute the index of its location and exit. + Builder.SetInsertPoint(BB3); + PHINode *MatchLCSSA = Builder.CreatePHI(PtrTy, 1, "match_start"); + PHINode *MatchPredLCSSA = + Builder.CreatePHI(MatchPred->getType(), 1, "match_vec"); + Value *MatchCnt = Builder.CreateIntrinsic( + Intrinsic::experimental_cttz_elts, {I64Ty, MatchPred->getType()}, + {MatchPredLCSSA, /*ZeroIsPoison=*/Builder.getInt1(true)}, nullptr, + "match_idx"); + Value *MatchVal = + Builder.CreateGEP(CharTy, MatchLCSSA, MatchCnt, "match_res"); + Builder.CreateBr(ExitSucc); + DTU.applyUpdates({{DominatorTree::Insert, BB3, ExitSucc}}); + + // (4) Check if we've reached the end of the needle array. + Builder.SetInsertPoint(BB4); + Value *NextNeedle = + Builder.CreateGEP(CharTy, Needle, ConstVF, "needle_next_vec"); + Builder.CreateCondBr(Builder.CreateICmpULT(NextNeedle, NeedleEnd), BB2, BB5); + DTU.applyUpdates( + {{DominatorTree::Insert, BB4, BB2}, {DominatorTree::Insert, BB4, BB5}}); + + // (5) Check if we've reached the end of the search array. + Builder.SetInsertPoint(BB5); + Value *NextSearch = + Builder.CreateGEP(CharTy, Search, ConstVF, "search_next_vec"); + Builder.CreateCondBr(Builder.CreateICmpULT(NextSearch, SearchEnd), BB1, + ExitFail); + DTU.applyUpdates({{DominatorTree::Insert, BB5, BB1}, + {DominatorTree::Insert, BB5, ExitFail}}); + + // Set up the PHI nodes. + Search->addIncoming(SearchStart, BB0); + Search->addIncoming(NextSearch, BB5); + Needle->addIncoming(NeedleStart, BB1); + Needle->addIncoming(NextNeedle, BB4); + // These are needed to retain LCSSA form. 
+ MatchLCSSA->addIncoming(Search, BB2); + MatchPredLCSSA->addIncoming(MatchPred, BB2); + + if (VerifyLoops) { + OuterLoop->verifyLoop(); + InnerLoop->verifyLoop(); + if (!OuterLoop->isRecursivelyLCSSAForm(*DT, *LI)) + report_fatal_error("Loops must remain in LCSSA form!"); + } + + return MatchVal; +} + +void LoopIdiomVectorize::transformFindFirstByte( + PHINode *IndPhi, unsigned VF, Type *CharTy, BasicBlock *ExitSucc, + BasicBlock *ExitFail, Value *SearchStart, Value *SearchEnd, + Value *NeedleStart, Value *NeedleEnd) { + // Insert the find first byte code at the end of the preheader block. + BasicBlock *Preheader = CurLoop->getLoopPreheader(); + BranchInst *PHBranch = cast(Preheader->getTerminator()); + IRBuilder<> Builder(PHBranch); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc()); + + Value *MatchVal = + expandFindFirstByte(Builder, DTU, VF, CharTy, ExitSucc, ExitFail, + SearchStart, SearchEnd, NeedleStart, NeedleEnd); + + assert(PHBranch->isUnconditional() && + "Expected preheader to terminate with an unconditional branch."); + + // Add new incoming values with the result of the transformation to PHINodes + // of ExitSucc that use IndPhi. 
+ for (auto *U : llvm::make_early_inc_range(IndPhi->users())) { + auto *PN = dyn_cast(U); + if (PN && PN->getParent() == ExitSucc) + PN->addIncoming(MatchVal, cast(MatchVal)->getParent()); + } + + if (VerifyLoops && CurLoop->getParentLoop()) { + CurLoop->getParentLoop()->verifyLoop(); + if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI)) + report_fatal_error("Loops must remain in LCSSA form!"); + } +} diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll new file mode 100644 index 0000000000000..8ef2a51506606 --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll @@ -0,0 +1,671 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=aarch64 -mattr=+sve -passes='loop(loop-idiom-vectorize)' -verify-loop-info -verify-dom-info -S < %s | FileCheck %s +; RUN: opt -mtriple=aarch64 -mattr=+sve -passes='loop(loop-idiom-vectorize)' -disable-loop-idiom-vectorize-find-first-byte -S < %s | FileCheck -check-prefix=DISABLE %s + +; Base case based on `libcxx/include/__algorithm/find_first_of.h': +; char* find_first_of(char *first, char *last, char *s_first, char *s_last) { +; for (; first != last; ++first) +; for (char *it = s_first; it != s_last; ++it) +; if (*first == *it) +; return first; +; return last; +; } +define ptr @find_first_of_i8(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 { +; CHECK-LABEL: define ptr @find_first_of_i8( +; CHECK-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]] +; CHECK-NEXT: [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]] +; CHECK-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]] +; CHECK-NEXT: br i1 [[COMBINED_TEST]], label 
%[[EXIT:.*]], label %[[HEADER_PREHEADER:.*]] +; CHECK: [[HEADER_PREHEADER]]: +; CHECK-NEXT: br label %[[MEM_CHECK:.*]] +; CHECK: [[MEM_CHECK]]: +; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64 +; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64 +; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64 +; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16) +; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12 +; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12 +; CHECK-NEXT: [[NEEDLE_START_PAGE:%.*]] = lshr i64 [[NEEDLE_START_INT]], 12 +; CHECK-NEXT: [[NEEDLE_END_PAGE:%.*]] = lshr i64 [[NEEDLE_END_INT]], 12 +; CHECK-NEXT: [[SEARCH_PAGE_CMP:%.*]] = icmp ne i64 [[SEARCH_START_PAGE]], [[SEARCH_END_PAGE]] +; CHECK-NEXT: [[NEEDLE_PAGE_CMP:%.*]] = icmp ne i64 [[NEEDLE_START_PAGE]], [[NEEDLE_END_PAGE]] +; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]] +; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0:![0-9]+]] +; CHECK: [[FIND_FIRST_VEC_HEADER]]: +; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 +; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]]) +; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and [[TMP0]], [[SEARCH_PRED]] +; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, [[SEARCH_MASKED]], zeroinitializer) +; CHECK-NEXT: br label %[[MATCH_CHECK_VEC:.*]] +; CHECK: [[MATCH_CHECK_VEC]]: +; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ 
[[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64 +; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]]) +; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and [[TMP0]], [[NEEDLE_PRED]] +; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, [[NEEDLE_MASKED]], zeroinitializer) +; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement [[NEEDLE_LOAD_VEC]], i64 0 +; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement poison, i8 [[NEEDLE0]], i64 0 +; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector [[NEEDLE0_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[NEEDLE_SPLAT:%.*]] = select [[NEEDLE_MASKED]], [[NEEDLE_LOAD_VEC]], [[NEEDLE0_SPLAT]] +; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8( [[NEEDLE_SPLAT]], i64 0) +; CHECK-NEXT: [[MATCH_PRED:%.*]] = call @llvm.experimental.vector.match.nxv16i8.v16i8( [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], [[SEARCH_MASKED]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[MATCH_PRED]]) +; CHECK-NEXT: br i1 [[TMP3]], label %[[CALCULATE_MATCH:.*]], label %[[NEEDLE_CHECK_VEC]] +; CHECK: [[CALCULATE_MATCH]]: +; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ] +; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ] +; CHECK-NEXT: [[MATCH_IDX:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[MATCH_VEC]], i1 true) +; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]] +; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]] +; CHECK: [[NEEDLE_CHECK_VEC]]: +; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[MATCH_CHECK_VEC]], label %[[SEARCH_CHECK_VEC]] +; CHECK: [[SEARCH_CHECK_VEC]]: 
+; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_LOOPEXIT1:.*]] +; CHECK: [[SCALAR_PREHEADER]]: +; CHECK-NEXT: br label %[[HEADER:.*]] +; CHECK: [[HEADER]]: +; CHECK-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[SCALAR_PREHEADER]] ] +; CHECK-NEXT: [[SEARCH_LOAD:%.*]] = load i8, ptr [[SEARCH_PTR]], align 1 +; CHECK-NEXT: br label %[[MATCH_CHECK:.*]] +; CHECK: [[NEEDLE_CHECK:.*]]: +; CHECK-NEXT: [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i8, ptr [[NEEDLE_PTR:%.*]], i64 1 +; CHECK-NEXT: [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]] +; CHECK-NEXT: br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]] +; CHECK: [[MATCH_CHECK]]: +; CHECK-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ] +; CHECK-NEXT: [[NEEDLE_LOAD:%.*]] = load i8, ptr [[NEEDLE_PTR]], align 1 +; CHECK-NEXT: [[MATCH_CMP:%.*]] = icmp eq i8 [[SEARCH_LOAD]], [[NEEDLE_LOAD]] +; CHECK-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT]], label %[[NEEDLE_CHECK]] +; CHECK: [[SEARCH_CHECK]]: +; CHECK-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i8, ptr [[SEARCH_PTR]], i64 1 +; CHECK-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]] +; CHECK-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1]], label %[[HEADER]] +; CHECK: [[EXIT_LOOPEXIT]]: +; CHECK-NEXT: [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ], [ [[MATCH_RES]], %[[CALCULATE_MATCH]] ] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT_LOOPEXIT1]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_PTR_LCSSA]], %[[EXIT_LOOPEXIT]] ], [ [[SEARCH_END]], %[[EXIT_LOOPEXIT1]] ] +; CHECK-NEXT: ret ptr 
[[RES]] +; +; DISABLE-LABEL: define ptr @find_first_of_i8( +; DISABLE-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0:[0-9]+]] { +; DISABLE-NEXT: [[ENTRY:.*]]: +; DISABLE-NEXT: [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]] +; DISABLE-NEXT: [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]] +; DISABLE-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]] +; DISABLE-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT:.*]], label %[[HEADER_PREHEADER:.*]] +; DISABLE: [[HEADER_PREHEADER]]: +; DISABLE-NEXT: br label %[[HEADER:.*]] +; DISABLE: [[HEADER]]: +; DISABLE-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[HEADER_PREHEADER]] ] +; DISABLE-NEXT: [[SEARCH_LOAD:%.*]] = load i8, ptr [[SEARCH_PTR]], align 1 +; DISABLE-NEXT: br label %[[MATCH_CHECK:.*]] +; DISABLE: [[NEEDLE_CHECK:.*]]: +; DISABLE-NEXT: [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i8, ptr [[NEEDLE_PTR:%.*]], i64 1 +; DISABLE-NEXT: [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]] +; DISABLE-NEXT: br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]] +; DISABLE: [[MATCH_CHECK]]: +; DISABLE-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ] +; DISABLE-NEXT: [[NEEDLE_LOAD:%.*]] = load i8, ptr [[NEEDLE_PTR]], align 1 +; DISABLE-NEXT: [[MATCH_CMP:%.*]] = icmp eq i8 [[SEARCH_LOAD]], [[NEEDLE_LOAD]] +; DISABLE-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT:.*]], label %[[NEEDLE_CHECK]] +; DISABLE: [[SEARCH_CHECK]]: +; DISABLE-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i8, ptr [[SEARCH_PTR]], i64 1 +; DISABLE-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]] +; DISABLE-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1:.*]], label %[[HEADER]] +; DISABLE: [[EXIT_LOOPEXIT]]: +; DISABLE-NEXT: [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ 
[[SEARCH_PTR]], %[[MATCH_CHECK]] ] +; DISABLE-NEXT: br label %[[EXIT]] +; DISABLE: [[EXIT_LOOPEXIT1]]: +; DISABLE-NEXT: br label %[[EXIT]] +; DISABLE: [[EXIT]]: +; DISABLE-NEXT: [[RES:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_PTR_LCSSA]], %[[EXIT_LOOPEXIT]] ], [ [[SEARCH_END]], %[[EXIT_LOOPEXIT1]] ] +; DISABLE-NEXT: ret ptr [[RES]] +; +entry: + %search_test = icmp eq ptr %search_start, %search_end + %needle_test = icmp eq ptr %needle_start, %needle_end + %combined_test = or i1 %search_test, %needle_test + br i1 %combined_test, label %exit, label %header + +header: + %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ] + %search_load = load i8, ptr %search_ptr, align 1 + br label %match_check + +needle_check: + %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1 + %needle_cmp = icmp eq ptr %needle_next, %needle_end + br i1 %needle_cmp, label %search_check, label %match_check + +match_check: + %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ] + %needle_load = load i8, ptr %needle_ptr, align 1 + %match_cmp = icmp eq i8 %search_load, %needle_load + br i1 %match_cmp, label %exit, label %needle_check + +search_check: + %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1 + %search_cmp = icmp eq ptr %search_next, %search_end + br i1 %search_cmp, label %exit, label %header + +exit: + %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ] + ret ptr %res +} + +; Equivalent to @find_first_of_i8 but with i16. +; This is accepted and generates a similar loop. 
+define ptr @find_first_of_i16(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 { +; CHECK-LABEL: define ptr @find_first_of_i16( +; CHECK-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]] +; CHECK-NEXT: [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]] +; CHECK-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]] +; CHECK-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT:.*]], label %[[HEADER_PREHEADER:.*]] +; CHECK: [[HEADER_PREHEADER]]: +; CHECK-NEXT: br label %[[MEM_CHECK:.*]] +; CHECK: [[MEM_CHECK]]: +; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64 +; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64 +; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64 +; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8) +; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12 +; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12 +; CHECK-NEXT: [[NEEDLE_START_PAGE:%.*]] = lshr i64 [[NEEDLE_START_INT]], 12 +; CHECK-NEXT: [[NEEDLE_END_PAGE:%.*]] = lshr i64 [[NEEDLE_END_INT]], 12 +; CHECK-NEXT: [[SEARCH_PAGE_CMP:%.*]] = icmp ne i64 [[SEARCH_START_PAGE]], [[SEARCH_END_PAGE]] +; CHECK-NEXT: [[NEEDLE_PAGE_CMP:%.*]] = icmp ne i64 [[NEEDLE_START_PAGE]], [[NEEDLE_END_PAGE]] +; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]] +; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0]] +; CHECK: [[FIND_FIRST_VEC_HEADER]]: +; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], 
%[[SEARCH_CHECK_VEC:.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 +; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]]) +; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and [[TMP0]], [[SEARCH_PRED]] +; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call @llvm.masked.load.nxv8i16.p0(ptr [[PSEARCH]], i32 1, [[SEARCH_MASKED]], zeroinitializer) +; CHECK-NEXT: br label %[[MATCH_CHECK_VEC:.*]] +; CHECK: [[MATCH_CHECK_VEC]]: +; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64 +; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]]) +; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and [[TMP0]], [[NEEDLE_PRED]] +; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call @llvm.masked.load.nxv8i16.p0(ptr [[PNEEDLE]], i32 1, [[NEEDLE_MASKED]], zeroinitializer) +; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement [[NEEDLE_LOAD_VEC]], i64 0 +; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement poison, i16 [[NEEDLE0]], i64 0 +; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector [[NEEDLE0_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[NEEDLE_SPLAT:%.*]] = select [[NEEDLE_MASKED]], [[NEEDLE_LOAD_VEC]], [[NEEDLE0_SPLAT]] +; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16( [[NEEDLE_SPLAT]], i64 0) +; CHECK-NEXT: [[MATCH_PRED:%.*]] = call @llvm.experimental.vector.match.nxv8i16.v8i16( [[SEARCH_LOAD_VEC]], <8 x i16> [[NEEDLE_VEC]], [[SEARCH_MASKED]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.nxv8i1( [[MATCH_PRED]]) +; CHECK-NEXT: br i1 [[TMP3]], label %[[CALCULATE_MATCH:.*]], label %[[NEEDLE_CHECK_VEC]] +; CHECK: [[CALCULATE_MATCH]]: +; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ] +; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi [ 
[[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ] +; CHECK-NEXT: [[MATCH_IDX:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( [[MATCH_VEC]], i1 true) +; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i16, ptr [[MATCH_START]], i64 [[MATCH_IDX]] +; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]] +; CHECK: [[NEEDLE_CHECK_VEC]]: +; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i16, ptr [[PNEEDLE]], i64 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[MATCH_CHECK_VEC]], label %[[SEARCH_CHECK_VEC]] +; CHECK: [[SEARCH_CHECK_VEC]]: +; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i16, ptr [[PSEARCH]], i64 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_LOOPEXIT1:.*]] +; CHECK: [[SCALAR_PREHEADER]]: +; CHECK-NEXT: br label %[[HEADER:.*]] +; CHECK: [[HEADER]]: +; CHECK-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[SCALAR_PREHEADER]] ] +; CHECK-NEXT: [[SEARCH_LOAD:%.*]] = load i16, ptr [[SEARCH_PTR]], align 1 +; CHECK-NEXT: br label %[[MATCH_CHECK:.*]] +; CHECK: [[NEEDLE_CHECK:.*]]: +; CHECK-NEXT: [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i16, ptr [[NEEDLE_PTR:%.*]], i64 1 +; CHECK-NEXT: [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]] +; CHECK-NEXT: br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]] +; CHECK: [[MATCH_CHECK]]: +; CHECK-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ] +; CHECK-NEXT: [[NEEDLE_LOAD:%.*]] = load i16, ptr [[NEEDLE_PTR]], align 1 +; CHECK-NEXT: [[MATCH_CMP:%.*]] = icmp eq i16 [[SEARCH_LOAD]], [[NEEDLE_LOAD]] +; CHECK-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT]], label %[[NEEDLE_CHECK]] +; CHECK: [[SEARCH_CHECK]]: +; CHECK-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i16, ptr [[SEARCH_PTR]], i64 1 +; CHECK-NEXT: 
[[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]] +; CHECK-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1]], label %[[HEADER]] +; CHECK: [[EXIT_LOOPEXIT]]: +; CHECK-NEXT: [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ], [ [[MATCH_RES]], %[[CALCULATE_MATCH]] ] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT_LOOPEXIT1]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_PTR_LCSSA]], %[[EXIT_LOOPEXIT]] ], [ [[SEARCH_END]], %[[EXIT_LOOPEXIT1]] ] +; CHECK-NEXT: ret ptr [[RES]] +; +; DISABLE-LABEL: define ptr @find_first_of_i16( +; DISABLE-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0]] { +; DISABLE-NEXT: [[ENTRY:.*]]: +; DISABLE-NEXT: [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]] +; DISABLE-NEXT: [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]] +; DISABLE-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]] +; DISABLE-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT:.*]], label %[[HEADER_PREHEADER:.*]] +; DISABLE: [[HEADER_PREHEADER]]: +; DISABLE-NEXT: br label %[[HEADER:.*]] +; DISABLE: [[HEADER]]: +; DISABLE-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[HEADER_PREHEADER]] ] +; DISABLE-NEXT: [[SEARCH_LOAD:%.*]] = load i16, ptr [[SEARCH_PTR]], align 1 +; DISABLE-NEXT: br label %[[MATCH_CHECK:.*]] +; DISABLE: [[NEEDLE_CHECK:.*]]: +; DISABLE-NEXT: [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i16, ptr [[NEEDLE_PTR:%.*]], i64 1 +; DISABLE-NEXT: [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]] +; DISABLE-NEXT: br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]] +; DISABLE: [[MATCH_CHECK]]: +; DISABLE-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ] +; DISABLE-NEXT: 
[[NEEDLE_LOAD:%.*]] = load i16, ptr [[NEEDLE_PTR]], align 1 +; DISABLE-NEXT: [[MATCH_CMP:%.*]] = icmp eq i16 [[SEARCH_LOAD]], [[NEEDLE_LOAD]] +; DISABLE-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT:.*]], label %[[NEEDLE_CHECK]] +; DISABLE: [[SEARCH_CHECK]]: +; DISABLE-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i16, ptr [[SEARCH_PTR]], i64 1 +; DISABLE-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]] +; DISABLE-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1:.*]], label %[[HEADER]] +; DISABLE: [[EXIT_LOOPEXIT]]: +; DISABLE-NEXT: [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ] +; DISABLE-NEXT: br label %[[EXIT]] +; DISABLE: [[EXIT_LOOPEXIT1]]: +; DISABLE-NEXT: br label %[[EXIT]] +; DISABLE: [[EXIT]]: +; DISABLE-NEXT: [[RES:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_PTR_LCSSA]], %[[EXIT_LOOPEXIT]] ], [ [[SEARCH_END]], %[[EXIT_LOOPEXIT1]] ] +; DISABLE-NEXT: ret ptr [[RES]] +; +entry: + %search_test = icmp eq ptr %search_start, %search_end + %needle_test = icmp eq ptr %needle_start, %needle_end + %combined_test = or i1 %search_test, %needle_test + br i1 %combined_test, label %exit, label %header + +header: + %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ] + %search_load = load i16, ptr %search_ptr, align 1 + br label %match_check + +needle_check: + %needle_next = getelementptr inbounds i16, ptr %needle_ptr, i64 1 + %needle_cmp = icmp eq ptr %needle_next, %needle_end + br i1 %needle_cmp, label %search_check, label %match_check + +match_check: + %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ] + %needle_load = load i16, ptr %needle_ptr, align 1 + %match_cmp = icmp eq i16 %search_load, %needle_load + br i1 %match_cmp, label %exit, label %needle_check + +search_check: + %search_next = getelementptr inbounds i16, ptr %search_ptr, i64 1 + %search_cmp = icmp eq ptr %search_next, %search_end + br i1 %search_cmp, label %exit, label %header 
+ +exit: + %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ] + ret ptr %res +} + +; Same as @find_first_of_i8 but with two intermediate exit blocks for the +; "success" (exit_succ) and "failure" (exit_fail) paths. +define ptr @find_first_of_i8_multi_exit(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 { +; CHECK-LABEL: define ptr @find_first_of_i8_multi_exit( +; CHECK-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]] +; CHECK-NEXT: [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]] +; CHECK-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]] +; CHECK-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT_FAIL:.*]], label %[[HEADER_PREHEADER:.*]] +; CHECK: [[HEADER_PREHEADER]]: +; CHECK-NEXT: br label %[[MEM_CHECK:.*]] +; CHECK: [[MEM_CHECK]]: +; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64 +; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64 +; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64 +; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16) +; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12 +; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12 +; CHECK-NEXT: [[NEEDLE_START_PAGE:%.*]] = lshr i64 [[NEEDLE_START_INT]], 12 +; CHECK-NEXT: [[NEEDLE_END_PAGE:%.*]] = lshr i64 [[NEEDLE_END_INT]], 12 +; CHECK-NEXT: [[SEARCH_PAGE_CMP:%.*]] = icmp ne i64 [[SEARCH_START_PAGE]], [[SEARCH_END_PAGE]] +; CHECK-NEXT: [[NEEDLE_PAGE_CMP:%.*]] = icmp ne i64 [[NEEDLE_START_PAGE]], [[NEEDLE_END_PAGE]] +; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], 
[[NEEDLE_PAGE_CMP]] +; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0]] +; CHECK: [[FIND_FIRST_VEC_HEADER]]: +; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 +; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]]) +; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and [[TMP0]], [[SEARCH_PRED]] +; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, [[SEARCH_MASKED]], zeroinitializer) +; CHECK-NEXT: br label %[[MATCH_CHECK_VEC:.*]] +; CHECK: [[MATCH_CHECK_VEC]]: +; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64 +; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]]) +; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and [[TMP0]], [[NEEDLE_PRED]] +; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, [[NEEDLE_MASKED]], zeroinitializer) +; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement [[NEEDLE_LOAD_VEC]], i64 0 +; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement poison, i8 [[NEEDLE0]], i64 0 +; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector [[NEEDLE0_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[NEEDLE_SPLAT:%.*]] = select [[NEEDLE_MASKED]], [[NEEDLE_LOAD_VEC]], [[NEEDLE0_SPLAT]] +; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8( [[NEEDLE_SPLAT]], i64 0) +; CHECK-NEXT: [[MATCH_PRED:%.*]] = call @llvm.experimental.vector.match.nxv16i8.v16i8( [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], [[SEARCH_MASKED]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i1 
@llvm.vector.reduce.or.nxv16i1( [[MATCH_PRED]]) +; CHECK-NEXT: br i1 [[TMP3]], label %[[CALCULATE_MATCH:.*]], label %[[NEEDLE_CHECK_VEC]] +; CHECK: [[CALCULATE_MATCH]]: +; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ] +; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ] +; CHECK-NEXT: [[MATCH_IDX:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[MATCH_VEC]], i1 true) +; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]] +; CHECK-NEXT: br label %[[EXIT_SUCC:.*]] +; CHECK: [[NEEDLE_CHECK_VEC]]: +; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[MATCH_CHECK_VEC]], label %[[SEARCH_CHECK_VEC]] +; CHECK: [[SEARCH_CHECK_VEC]]: +; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_FAIL_LOOPEXIT:.*]] +; CHECK: [[SCALAR_PREHEADER]]: +; CHECK-NEXT: br label %[[HEADER:.*]] +; CHECK: [[HEADER]]: +; CHECK-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[SCALAR_PREHEADER]] ] +; CHECK-NEXT: [[SEARCH_LOAD:%.*]] = load i8, ptr [[SEARCH_PTR]], align 1 +; CHECK-NEXT: br label %[[MATCH_CHECK:.*]] +; CHECK: [[NEEDLE_CHECK:.*]]: +; CHECK-NEXT: [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i8, ptr [[NEEDLE_PTR:%.*]], i64 1 +; CHECK-NEXT: [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]] +; CHECK-NEXT: br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]] +; CHECK: [[MATCH_CHECK]]: +; CHECK-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ] +; CHECK-NEXT: [[NEEDLE_LOAD:%.*]] = load i8, ptr [[NEEDLE_PTR]], align 1 +; 
CHECK-NEXT: [[MATCH_CMP:%.*]] = icmp eq i8 [[SEARCH_LOAD]], [[NEEDLE_LOAD]] +; CHECK-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_SUCC]], label %[[NEEDLE_CHECK]] +; CHECK: [[SEARCH_CHECK]]: +; CHECK-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i8, ptr [[SEARCH_PTR]], i64 1 +; CHECK-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]] +; CHECK-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_FAIL_LOOPEXIT]], label %[[HEADER]] +; CHECK: [[EXIT_SUCC]]: +; CHECK-NEXT: [[RES_SUCC:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ], [ [[MATCH_RES]], %[[CALCULATE_MATCH]] ] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT_FAIL_LOOPEXIT]]: +; CHECK-NEXT: br label %[[EXIT_FAIL]] +; CHECK: [[EXIT_FAIL]]: +; CHECK-NEXT: [[RES_FAIL:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_END]], %[[EXIT_FAIL_LOOPEXIT]] ] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi ptr [ [[RES_SUCC]], %[[EXIT_SUCC]] ], [ [[RES_FAIL]], %[[EXIT_FAIL]] ] +; CHECK-NEXT: ret ptr [[RES]] +; +; DISABLE-LABEL: define ptr @find_first_of_i8_multi_exit( +; DISABLE-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0]] { +; DISABLE-NEXT: [[ENTRY:.*]]: +; DISABLE-NEXT: [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]] +; DISABLE-NEXT: [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]] +; DISABLE-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]] +; DISABLE-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT_FAIL:.*]], label %[[HEADER_PREHEADER:.*]] +; DISABLE: [[HEADER_PREHEADER]]: +; DISABLE-NEXT: br label %[[HEADER:.*]] +; DISABLE: [[HEADER]]: +; DISABLE-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[HEADER_PREHEADER]] ] +; DISABLE-NEXT: [[SEARCH_LOAD:%.*]] = load i8, ptr [[SEARCH_PTR]], align 1 +; DISABLE-NEXT: br label %[[MATCH_CHECK:.*]] +; DISABLE: [[NEEDLE_CHECK:.*]]: +; 
DISABLE-NEXT: [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i8, ptr [[NEEDLE_PTR:%.*]], i64 1 +; DISABLE-NEXT: [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]] +; DISABLE-NEXT: br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]] +; DISABLE: [[MATCH_CHECK]]: +; DISABLE-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ] +; DISABLE-NEXT: [[NEEDLE_LOAD:%.*]] = load i8, ptr [[NEEDLE_PTR]], align 1 +; DISABLE-NEXT: [[MATCH_CMP:%.*]] = icmp eq i8 [[SEARCH_LOAD]], [[NEEDLE_LOAD]] +; DISABLE-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_SUCC:.*]], label %[[NEEDLE_CHECK]] +; DISABLE: [[SEARCH_CHECK]]: +; DISABLE-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i8, ptr [[SEARCH_PTR]], i64 1 +; DISABLE-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]] +; DISABLE-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_FAIL_LOOPEXIT:.*]], label %[[HEADER]] +; DISABLE: [[EXIT_SUCC]]: +; DISABLE-NEXT: [[RES_SUCC:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ] +; DISABLE-NEXT: br label %[[EXIT:.*]] +; DISABLE: [[EXIT_FAIL_LOOPEXIT]]: +; DISABLE-NEXT: br label %[[EXIT_FAIL]] +; DISABLE: [[EXIT_FAIL]]: +; DISABLE-NEXT: [[RES_FAIL:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_END]], %[[EXIT_FAIL_LOOPEXIT]] ] +; DISABLE-NEXT: br label %[[EXIT]] +; DISABLE: [[EXIT]]: +; DISABLE-NEXT: [[RES:%.*]] = phi ptr [ [[RES_SUCC]], %[[EXIT_SUCC]] ], [ [[RES_FAIL]], %[[EXIT_FAIL]] ] +; DISABLE-NEXT: ret ptr [[RES]] +; +entry: + %search_test = icmp eq ptr %search_start, %search_end + %needle_test = icmp eq ptr %needle_start, %needle_end + %combined_test = or i1 %search_test, %needle_test + br i1 %combined_test, label %exit_fail, label %header + +header: + %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ] + %search_load = load i8, ptr %search_ptr, align 1 + br label %match_check + +needle_check: + %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1 + 
%needle_cmp = icmp eq ptr %needle_next, %needle_end + br i1 %needle_cmp, label %search_check, label %match_check + +match_check: + %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ] + %needle_load = load i8, ptr %needle_ptr, align 1 + %match_cmp = icmp eq i8 %search_load, %needle_load + br i1 %match_cmp, label %exit_succ, label %needle_check + +search_check: + %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1 + %search_cmp = icmp eq ptr %search_next, %search_end + br i1 %search_cmp, label %exit_fail, label %header + +exit_succ: + %res_succ = phi ptr [ %search_ptr, %match_check ] + br label %exit + +exit_fail: + %res_fail = phi ptr [ %search_end, %entry ], [ %search_end, %search_check ] + br label %exit + +exit: + %res = phi ptr [ %res_succ, %exit_succ ], [ %res_fail, %exit_fail ] + ret ptr %res +} + +; From here on we only test for the presence/absence of the intrinsic. +; UTC_ARGS: --disable + +; Same as @find_first_of_i8 but with `ne' comparison. +; This is rejected for now, but should eventually be supported. 
+define ptr @find_first_not_of_i8(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 { +; CHECK-LABEL: define ptr @find_first_not_of_i8( +; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} +; +; DISABLE-LABEL: define ptr @find_first_not_of_i8( +; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} +; +entry: + %search_test = icmp eq ptr %search_start, %search_end + %needle_test = icmp eq ptr %needle_start, %needle_end + %combined_test = or i1 %search_test, %needle_test + br i1 %combined_test, label %exit, label %header + +header: + %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ] + %search_load = load i8, ptr %search_ptr, align 1 + br label %match_check + +needle_check: + %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1 + %needle_cmp = icmp eq ptr %needle_next, %needle_end + br i1 %needle_cmp, label %search_check, label %match_check + +match_check: + %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ] + %needle_load = load i8, ptr %needle_ptr, align 1 + %match_cmp = icmp ne i8 %search_load, %needle_load + br i1 %match_cmp, label %exit, label %needle_check + +search_check: + %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1 + %search_cmp = icmp eq ptr %search_next, %search_end + br i1 %search_cmp, label %exit, label %header + +exit: + %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ] + ret ptr %res +} + +; This is the same as @find_first_of_i8 but without SVE2, which we require to +; perform the conversion. 
+define ptr @find_first_of_i8_nosve2(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) { +; CHECK-LABEL: define ptr @find_first_of_i8_nosve2( +; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} +; +; DISABLE-LABEL: define ptr @find_first_of_i8_nosve2( +; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} +; +entry: + %search_test = icmp eq ptr %search_start, %search_end + %needle_test = icmp eq ptr %needle_start, %needle_end + %combined_test = or i1 %search_test, %needle_test + br i1 %combined_test, label %exit, label %header + +header: + %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ] + %search_load = load i8, ptr %search_ptr, align 1 + br label %match_check + +needle_check: + %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1 + %needle_cmp = icmp eq ptr %needle_next, %needle_end + br i1 %needle_cmp, label %search_check, label %match_check + +match_check: + %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ] + %needle_load = load i8, ptr %needle_ptr, align 1 + %match_cmp = icmp eq i8 %search_load, %needle_load + br i1 %match_cmp, label %exit, label %needle_check + +search_check: + %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1 + %search_cmp = icmp eq ptr %search_next, %search_end + br i1 %search_cmp, label %exit, label %header + +exit: + %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ] + ret ptr %res +} + +; Same as @find_first_of_i8 but here we use the inner PHI outside the loop nest. +; This isn't supported. 
+define ptr @find_first_of_i8_outside_use(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 { +; CHECK-LABEL: define ptr @find_first_of_i8_outside_use( +; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} +; +; DISABLE-LABEL: define ptr @find_first_of_i8_outside_use( +; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} +; +entry: + %search_test = icmp eq ptr %search_start, %search_end + %needle_test = icmp eq ptr %needle_start, %needle_end + %combined_test = or i1 %search_test, %needle_test + br i1 %combined_test, label %exit, label %header + +header: + %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ] + %search_load = load i8, ptr %search_ptr, align 1 + br label %match_check + +needle_check: + %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1 + %needle_cmp = icmp eq ptr %needle_next, %needle_end + br i1 %needle_cmp, label %search_check, label %match_check + +match_check: + %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ] + %needle_load = load i8, ptr %needle_ptr, align 1 + %match_cmp = icmp eq i8 %search_load, %needle_load + br i1 %match_cmp, label %exit, label %needle_check + +search_check: + %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1 + %search_cmp = icmp eq ptr %search_next, %search_end + br i1 %search_cmp, label %exit, label %header + +exit: + %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ] + %use = phi ptr [ %needle_end, %entry ], [ %needle_ptr, %match_check ], [ %needle_end, %search_check ] + ret ptr %res +} + +; Same as @find_first_of_i8_multi_exit but `search_ptr' is used in `exit_fail' +; which should block the transform. 
+define ptr @find_first_of_i8_multi_exit_outside_use(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 { +; CHECK-LABEL: define ptr @find_first_of_i8_multi_exit_outside_use( +; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} +; +; DISABLE-LABEL: define ptr @find_first_of_i8_multi_exit_outside_use( +; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} +; +entry: + %search_test = icmp eq ptr %search_start, %search_end + %needle_test = icmp eq ptr %needle_start, %needle_end + %combined_test = or i1 %search_test, %needle_test + br i1 %combined_test, label %exit_fail, label %header + +header: + %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ] + %search_load = load i8, ptr %search_ptr, align 1 + br label %match_check + +needle_check: + %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1 + %needle_cmp = icmp eq ptr %needle_next, %needle_end + br i1 %needle_cmp, label %search_check, label %match_check + +match_check: + %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ] + %needle_load = load i8, ptr %needle_ptr, align 1 + %match_cmp = icmp eq i8 %search_load, %needle_load + br i1 %match_cmp, label %exit_succ, label %needle_check + +search_check: + %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1 + %search_cmp = icmp eq ptr %search_next, %search_end + br i1 %search_cmp, label %exit_fail, label %header + +exit_succ: + %res_succ = phi ptr [ %search_ptr, %match_check ] + br label %exit + +exit_fail: + %res_fail = phi ptr [ %search_end, %entry ], [ %search_ptr, %search_check ] + br label %exit + +exit: + %res = phi ptr [ %res_succ, %exit_succ ], [ %res_fail, %exit_fail ] + ret ptr %res +} + +attributes #0 = { "target-features"="+sve2" } + +; CHECK: [[PROF0]] = !{!"branch_weights", i32 10, i32 90} From d9cdf27834de94a7c6f5b66b28c0e6667fec5418 Mon Sep 17 00:00:00 2001 From: Aniket Lal Date: Mon, 10 Feb 2025 13:57:52 +0530 Subject: [PATCH 124/293] 
[Driver][HIP] Do not pass -dependency-file flag for HIP Device offloading (#125646) When we launch hipcc with multiple offload architectures along with -MF dep_file flag, the clang compilation invocations for host and device offloads write to the same dep_file, and can lead to collision during file IO operations. This can typically happen during large workloads. This commit provides a fix to generate dep_file only in host compilation. --------- Co-authored-by: anikelal --- clang/lib/Driver/ToolChains/Clang.cpp | 30 ++++++++++--------- ...-file-flag-with-multiple-offload-archs.hip | 13 ++++++++ 2 files changed, 29 insertions(+), 14 deletions(-) create mode 100644 clang/test/Driver/dep-file-flag-with-multiple-offload-archs.hip diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index fe879e8f8bd27..82f4cabd620d7 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1042,21 +1042,23 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA, ArgM = ArgMD; if (ArgM) { - // Determine the output location. - const char *DepFile; - if (Arg *MF = Args.getLastArg(options::OPT_MF)) { - DepFile = MF->getValue(); - C.addFailureResultFile(DepFile, &JA); - } else if (Output.getType() == types::TY_Dependencies) { - DepFile = Output.getFilename(); - } else if (!ArgMD) { - DepFile = "-"; - } else { - DepFile = getDependencyFileName(Args, Inputs); - C.addFailureResultFile(DepFile, &JA); + if (!JA.isDeviceOffloading(Action::OFK_HIP)) { + // Determine the output location. 
+ const char *DepFile; + if (Arg *MF = Args.getLastArg(options::OPT_MF)) { + DepFile = MF->getValue(); + C.addFailureResultFile(DepFile, &JA); + } else if (Output.getType() == types::TY_Dependencies) { + DepFile = Output.getFilename(); + } else if (!ArgMD) { + DepFile = "-"; + } else { + DepFile = getDependencyFileName(Args, Inputs); + C.addFailureResultFile(DepFile, &JA); + } + CmdArgs.push_back("-dependency-file"); + CmdArgs.push_back(DepFile); } - CmdArgs.push_back("-dependency-file"); - CmdArgs.push_back(DepFile); bool HasTarget = false; for (const Arg *A : Args.filtered(options::OPT_MT, options::OPT_MQ)) { diff --git a/clang/test/Driver/dep-file-flag-with-multiple-offload-archs.hip b/clang/test/Driver/dep-file-flag-with-multiple-offload-archs.hip new file mode 100644 index 0000000000000..d26faf7242f91 --- /dev/null +++ b/clang/test/Driver/dep-file-flag-with-multiple-offload-archs.hip @@ -0,0 +1,13 @@ +// RUN: %clang -### -nogpuinc -nogpulib --offload-arch=gfx1030 --offload-arch=gfx1100 --offload-arch=gfx1101 -MD -MF tmp.d %s 2>&1 | FileCheck %s + +// CHECK: Build config: +// CHECK-NOT: {{.*}}clang{{.*}}"-target-cpu" "gfx1030"{{.*}}"-dependency-file" "tmp.d" +// CHECK: {{.*}}lld{{.*}}"-plugin-opt=mcpu=gfx1030" +// CHECK-NOT: {{.*}}clang{{.*}}"-target-cpu" "gfx1100"{{.*}}"-dependency-file" "tmp.d" +// CHECK: {{.*}}lld{{.*}}"-plugin-opt=mcpu=gfx1100" +// CHECK-NOT: {{.*}}clang{{.*}}"-target-cpu" "gfx1101"{{.*}}"-dependency-file" "tmp.d" +// CHECK: {{.*}}lld{{.*}}"-plugin-opt=mcpu=gfx1101" +// CHECK: {{.*}}clang-offload-bundler +// CHECK: {{.*}}clang{{.*}}"-target-cpu" "x86-64"{{.*}}"-dependency-file" "tmp.d" + +void main(){} From 91682da4388037489ecc62a5e5c06a290866e018 Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Mon, 10 Feb 2025 16:40:07 +0800 Subject: [PATCH 125/293] [LoongArch] Pre-commit tests for tls-le merge base offset. NFC (#122998) Similar to tests in `merge-base-offset.ll`, except for tests of blockaddress. A later commit will optimize this. 
--- .../LoongArch/merge-base-offset-tlsle.ll | 971 ++++++++++++++++++ .../CodeGen/LoongArch/merge-base-offset.ll | 44 +- 2 files changed, 993 insertions(+), 22 deletions(-) create mode 100644 llvm/test/CodeGen/LoongArch/merge-base-offset-tlsle.ll diff --git a/llvm/test/CodeGen/LoongArch/merge-base-offset-tlsle.ll b/llvm/test/CodeGen/LoongArch/merge-base-offset-tlsle.ll new file mode 100644 index 0000000000000..7e995d224ce1d --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/merge-base-offset-tlsle.ll @@ -0,0 +1,971 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+lasx --verify-machineinstrs < %s \ +; RUN: | FileCheck --check-prefix=LA32 %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx --verify-machineinstrs < %s \ +; RUN: | FileCheck --check-prefix=LA64 %s + +@g_i8 = dso_local thread_local(localexec) global i8 0 + +define dso_local signext i8 @tlsle_load_s8() nounwind { +; LA32-LABEL: tlsle_load_s8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i8) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i8) +; LA32-NEXT: ld.b $a0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_s8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i8) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i8) +; LA64-NEXT: ld.b $a0, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i8) + %1 = load i8, ptr %0 + ret i8 %1 +} + +define dso_local zeroext i8 @tlsle_load_u8() nounwind { +; LA32-LABEL: tlsle_load_u8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i8) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i8) +; LA32-NEXT: ld.bu $a0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_u8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) +; 
LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i8) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i8) +; LA64-NEXT: ld.bu $a0, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i8) + %1 = load i8, ptr %0 + ret i8 %1 +} + +define dso_local void @tlsle_store_i8() nounwind { +; LA32-LABEL: tlsle_store_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i8) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i8) +; LA32-NEXT: ori $a1, $zero, 1 +; LA32-NEXT: st.b $a1, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_store_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i8) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i8) +; LA64-NEXT: ori $a1, $zero, 1 +; LA64-NEXT: st.b $a1, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i8) + store i8 1, ptr %0 + ret void +} + +@g_i16 = dso_local thread_local(localexec) global i16 0 + +define dso_local signext i16 @tlsle_load_s16() nounwind { +; LA32-LABEL: tlsle_load_s16: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i16) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i16) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i16) +; LA32-NEXT: ld.h $a0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_s16: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i16) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i16) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i16) +; LA64-NEXT: ld.h $a0, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i16) + %1 = load i16, ptr %0 + ret i16 %1 +} + +define dso_local zeroext i16 @tlsle_load_u16() nounwind { +; LA32-LABEL: tlsle_load_u16: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i16) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i16) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i16) +; LA32-NEXT: ld.hu $a0, $a0, 0 
+; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_u16: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i16) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i16) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i16) +; LA64-NEXT: ld.hu $a0, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i16) + %1 = load i16, ptr %0 + ret i16 %1 +} + +define dso_local void @tlsle_store_i16() nounwind { +; LA32-LABEL: tlsle_store_i16: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i16) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i16) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i16) +; LA32-NEXT: ori $a1, $zero, 1 +; LA32-NEXT: st.h $a1, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_store_i16: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i16) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i16) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i16) +; LA64-NEXT: ori $a1, $zero, 1 +; LA64-NEXT: st.h $a1, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i16) + store i16 1, ptr %0 + ret void +} + +@g_i32 = dso_local thread_local(localexec) global i32 0 + +define dso_local signext i32 @tlsle_load_s32() nounwind { +; LA32-LABEL: tlsle_load_s32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i32) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i32) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i32) +; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_s32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i32) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i32) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i32) +; LA64-NEXT: ld.w $a0, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i32) + %1 = load i32, ptr %0 + ret i32 %1 +} + +define dso_local zeroext i32 @tlsle_load_u32() nounwind { +; LA32-LABEL: tlsle_load_u32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, 
%le_hi20_r(g_i32) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i32) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i32) +; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_u32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i32) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i32) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i32) +; LA64-NEXT: ld.wu $a0, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i32) + %1 = load i32, ptr %0 + ret i32 %1 +} + +define dso_local void @tlsle_store_i32() nounwind { +; LA32-LABEL: tlsle_store_i32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i32) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i32) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i32) +; LA32-NEXT: ori $a1, $zero, 1 +; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_store_i32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i32) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i32) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i32) +; LA64-NEXT: ori $a1, $zero, 1 +; LA64-NEXT: st.w $a1, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i32) + store i32 1, ptr %0 + ret void +} + +@g_i64 = dso_local thread_local(localexec) global i64 0 + +define dso_local i64 @tlsle_load_i64() nounwind { +; LA32-LABEL: tlsle_load_i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i64) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i64) +; LA32-NEXT: addi.w $a1, $a0, %le_lo12_r(g_i64) +; LA32-NEXT: ld.w $a0, $a1, 0 +; LA32-NEXT: ld.w $a1, $a1, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i64) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i64) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i64) +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i64) + %1 = load i64, ptr 
%0 + ret i64 %1 +} + +define dso_local void @tlsle_store_i64() nounwind { +; LA32-LABEL: tlsle_store_i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i64) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i64) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i64) +; LA32-NEXT: st.w $zero, $a0, 4 +; LA32-NEXT: ori $a1, $zero, 1 +; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_store_i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i64) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i64) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i64) +; LA64-NEXT: ori $a1, $zero, 1 +; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i64) + store i64 1, ptr %0 + ret void +} + +@g_f32 = dso_local thread_local(localexec) global float 0.0 + +define dso_local float @tlsle_load_f32() nounwind { +; LA32-LABEL: tlsle_load_f32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_f32) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_f32) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_f32) +; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_f32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_f32) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_f32) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_f32) +; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_f32) + %1 = load float, ptr %0 + ret float %1 +} + +define dso_local void @tlsle_store_f32() nounwind { +; LA32-LABEL: tlsle_store_f32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_f32) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_f32) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_f32) +; LA32-NEXT: lu12i.w $a1, 260096 +; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_store_f32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_f32) +; LA64-NEXT: add.d $a0, 
$a0, $tp, %le_add_r(g_f32) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_f32) +; LA64-NEXT: lu12i.w $a1, 260096 +; LA64-NEXT: st.w $a1, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_f32) + store float 1.0, ptr %0 + ret void +} + +@g_f64 = dso_local thread_local(localexec) global double 0.0 + +define dso_local double @tlsle_load_f64() nounwind { +; LA32-LABEL: tlsle_load_f64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_f64) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_f64) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_f64) +; LA32-NEXT: fld.d $fa0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_f64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_f64) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_f64) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_f64) +; LA64-NEXT: fld.d $fa0, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_f64) + %1 = load double, ptr %0 + ret double %1 +} + +define dso_local void @tlsle_store_f64() nounwind { +; LA32-LABEL: tlsle_store_f64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_f64) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_f64) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_f64) +; LA32-NEXT: vldi $vr0, -912 +; LA32-NEXT: fst.d $fa0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_store_f64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_f64) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_f64) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_f64) +; LA64-NEXT: lu52i.d $a1, $zero, 1023 +; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_f64) + store double 1.0, ptr %0 + ret void +} + +@g_m64 = dso_local thread_local(localexec) global i64 0 + +define dso_local void @tlsle_store_multi() nounwind { +; LA32-LABEL: tlsle_store_multi: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_m64) +; LA32-NEXT: add.w $a0, 
$a0, $tp, %le_add_r(g_m64) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_m64) +; LA32-NEXT: st.w $zero, $a0, 4 +; LA32-NEXT: ori $a1, $zero, 1 +; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $zero, $a0, 4 +; LA32-NEXT: ori $a1, $zero, 2 +; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_store_multi: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_m64) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_m64) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_m64) +; LA64-NEXT: ori $a1, $zero, 1 +; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: ori $a1, $zero, 2 +; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_m64) + store volatile i64 1, ptr %0 + store volatile i64 2, ptr %0 + ret void +} + +@g_sf32 = dso_local thread_local(localexec) global float 0.0 + +define dso_local void @tlsle_store_sf32() nounwind { +; LA32-LABEL: tlsle_store_sf32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_sf32) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_sf32) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_sf32) +; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fst.s $fa0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_store_sf32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_sf32) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_sf32) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_sf32) +; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fst.s $fa0, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_sf32) + %1 = load float, ptr %0 + store volatile float %1, ptr %0 + ret void +} + +@g_sf64 = dso_local thread_local(localexec) global double 0.0 + +define dso_local void @tlsle_store_sf64() nounwind { +; LA32-LABEL: tlsle_store_sf64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_sf64) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_sf64) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_sf64) +; LA32-NEXT: fld.d $fa0, $a0, 0 +; 
LA32-NEXT: fst.d $fa0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_store_sf64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_sf64) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_sf64) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_sf64) +; LA64-NEXT: fld.d $fa0, $a0, 0 +; LA64-NEXT: fst.d $fa0, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_sf64) + %1 = load double, ptr %0 + store volatile double %1, ptr %0 + ret void +} + +@g_i32x4_src = dso_local thread_local(localexec) global [4 x i32] zeroinitializer, align 16 +@g_i32x4_dst = dso_local thread_local(localexec) global [4 x i32] zeroinitializer, align 16 + +define dso_local void @tlsle_copy_i32x4() nounwind { +; LA32-LABEL: tlsle_copy_i32x4: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i32x4_src) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i32x4_src) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i32x4_src) +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i32x4_dst) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i32x4_dst) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i32x4_dst) +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_copy_i32x4: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i32x4_src) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i32x4_src) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i32x4_src) +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i32x4_dst) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i32x4_dst) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i32x4_dst) +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i32x4_src) + %1 = load <4 x i32>, ptr %0, align 16 + %2 = call ptr @llvm.threadlocal.address.p0(ptr @g_i32x4_dst) + store <4 x i32> %1, ptr %2, align 16 + ret void +} + +@g_i32x8_src = dso_local thread_local(localexec) global [8 x i32] zeroinitializer, align 32 
+@g_i32x8_dst = dso_local thread_local(localexec) global [8 x i32] zeroinitializer, align 32 + +define dso_local void @tlsle_copy_i32x8() nounwind { +; LA32-LABEL: tlsle_copy_i32x8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i32x8_src) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i32x8_src) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i32x8_src) +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i32x8_dst) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i32x8_dst) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i32x8_dst) +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_copy_i32x8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i32x8_src) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i32x8_src) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i32x8_src) +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i32x8_dst) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i32x8_dst) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i32x8_dst) +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i32x8_src) + %1 = load <8 x i32>, ptr %0, align 32 + %2 = call ptr @llvm.threadlocal.address.p0(ptr @g_i32x8_dst) + store <8 x i32> %1, ptr %2, align 32 + ret void +} + +@g_i8x16 = dso_local thread_local(localexec) global <16 x i8> zeroinitializer, align 16 + +define dso_local void @tlsle_copy_i8_to_i8x16() nounwind { +; LA32-LABEL: tlsle_copy_i8_to_i8x16: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i8) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i8) +; LA32-NEXT: vldrepl.b $vr0, $a0, 0 +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i8x16) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i8x16) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i8x16) +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_copy_i8_to_i8x16: +; LA64: # %bb.0: # %entry +; 
LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i8) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i8) +; LA64-NEXT: vldrepl.b $vr0, $a0, 0 +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i8x16) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i8x16) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i8x16) +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i8) + %1 = call <16 x i8> @llvm.loongarch.lsx.vldrepl.b(ptr %0, i32 0) + %2 = call ptr @llvm.threadlocal.address.p0(ptr @g_i8x16) + store <16 x i8> %1, ptr %2, align 16 + ret void +} + +@g_i8x32 = dso_local thread_local(localexec) global <32 x i8> zeroinitializer, align 32 + +define dso_local void @tlsle_copy_i8_to_i8x32() nounwind { +; LA32-LABEL: tlsle_copy_i8_to_i8x32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i8) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i8) +; LA32-NEXT: xvldrepl.b $xr0, $a0, 0 +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i8x32) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i8x32) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i8x32) +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_copy_i8_to_i8x32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i8) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i8) +; LA64-NEXT: xvldrepl.b $xr0, $a0, 0 +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i8x32) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i8x32) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i8x32) +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i8) + %1 = call <32 x i8> @llvm.loongarch.lasx.xvldrepl.b(ptr %0, i32 0) + %2 = call ptr @llvm.threadlocal.address.p0(ptr @g_i8x32) + store <32 x i8> %1, ptr %2, align 32 + ret void +} + +@g_rmw = dso_local thread_local(localexec) global i64 0 + +define dso_local 
void @tlsle_rmw() nounwind { +; LA32-LABEL: tlsle_rmw: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_rmw) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_rmw) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_rmw) +; LA32-NEXT: ld.w $a1, $a0, 0 +; LA32-NEXT: ld.w $a2, $a0, 4 +; LA32-NEXT: addi.w $a1, $a1, 1 +; LA32-NEXT: sltui $a3, $a1, 1 +; LA32-NEXT: add.w $a2, $a2, $a3 +; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $a2, $a0, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_rmw: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_rmw) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_rmw) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_rmw) +; LA64-NEXT: ld.d $a1, $a0, 0 +; LA64-NEXT: addi.d $a1, $a1, 1 +; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: ret +entry: + %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_rmw) + %1 = load i64, ptr %0 + %2 = add i64 %1, 1 + store i64 %2, ptr %0 + ret void +} + +@g_a32 = dso_local thread_local(localexec) global [2048 x i32] zeroinitializer, align 4 + +define dso_local void @tlsle_store_a32() nounwind { +; LA32-LABEL: tlsle_store_a32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a32) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a32) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a32) +; LA32-NEXT: lu12i.w $a1, 1 +; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: ori $a1, $zero, 1 +; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_store_a32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a32) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a32) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a32) +; LA64-NEXT: ori $a1, $zero, 1 +; LA64-NEXT: stptr.w $a1, $a0, 4096 +; LA64-NEXT: ret +entry: + store i32 1, ptr getelementptr inbounds ([1 x i32], ptr @g_a32, i32 1024), align 4 + ret void +} + +define dso_local void @tlsle_store_a32_2() nounwind { +; LA32-LABEL: tlsle_store_a32_2: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a32) +; 
LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a32) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a32) +; LA32-NEXT: lu12i.w $a1, 1 +; LA32-NEXT: add.w $a2, $a0, $a1 +; LA32-NEXT: ori $a3, $zero, 1 +; LA32-NEXT: st.w $a3, $a2, 0 +; LA32-NEXT: ori $a1, $a1, 8 +; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: ori $a1, $zero, 2 +; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_store_a32_2: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a32) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a32) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a32) +; LA64-NEXT: ori $a1, $zero, 1 +; LA64-NEXT: stptr.w $a1, $a0, 4096 +; LA64-NEXT: ori $a1, $zero, 2 +; LA64-NEXT: stptr.w $a1, $a0, 4104 +; LA64-NEXT: ret +entry: + store i32 1, ptr getelementptr inbounds ([1 x i32], ptr @g_a32, i32 1024), align 4 + store i32 2, ptr getelementptr inbounds ([1 x i32], ptr @g_a32, i32 1026), align 4 + ret void +} + +define dso_local void @tlsle_control_flow_with_mem_access() nounwind { +; LA32-LABEL: tlsle_control_flow_with_mem_access: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a32) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a32) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a32) +; LA32-NEXT: ld.w $a1, $a0, 4 +; LA32-NEXT: ori $a2, $zero, 1 +; LA32-NEXT: blt $a1, $a2, .LBB25_2 +; LA32-NEXT: # %bb.1: # %if.then +; LA32-NEXT: ori $a1, $zero, 10 +; LA32-NEXT: st.w $a1, $a0, 4 +; LA32-NEXT: .LBB25_2: # %if.end +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_control_flow_with_mem_access: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a32) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a32) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a32) +; LA64-NEXT: ld.w $a1, $a0, 4 +; LA64-NEXT: ori $a2, $zero, 1 +; LA64-NEXT: blt $a1, $a2, .LBB25_2 +; LA64-NEXT: # %bb.1: # %if.then +; LA64-NEXT: ori $a1, $zero, 10 +; LA64-NEXT: st.w $a1, $a0, 4 +; LA64-NEXT: .LBB25_2: # %if.end +; LA64-NEXT: ret +entry: + %0 = load i32, ptr getelementptr 
inbounds ([1 x i32], ptr @g_a32, i32 1), align 4 + %cmp = icmp sgt i32 %0, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + store i32 10, ptr getelementptr inbounds ([1 x i32], ptr @g_a32, i32 1), align 4 + br label %if.end + +if.end: + ret void +} + +@g_a64 = dso_local thread_local(localexec) global [614750729487779976 x i64] zeroinitializer, align 8 + +define dso_local ptr @tlsle_load_addr_offset_1() nounwind { +; LA32-LABEL: tlsle_load_addr_offset_1: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) +; LA32-NEXT: addi.w $a0, $a0, 8 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_addr_offset_1: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64) +; LA64-NEXT: addi.d $a0, $a0, 8 +; LA64-NEXT: ret +entry: + ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 1) +} + +define dso_local ptr @tlsle_load_addr_offset_257() nounwind { +; LA32-LABEL: tlsle_load_addr_offset_257: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) +; LA32-NEXT: addi.w $a0, $a0, 2047 +; LA32-NEXT: addi.w $a0, $a0, 9 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_addr_offset_257: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64) +; LA64-NEXT: addi.d $a0, $a0, 2047 +; LA64-NEXT: addi.d $a0, $a0, 9 +; LA64-NEXT: ret +entry: + ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 257) +} + +define dso_local ptr @tlsle_load_addr_offset_1048576() nounwind { +; LA32-LABEL: tlsle_load_addr_offset_1048576: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA32-NEXT: add.w $a0, 
$a0, $tp, %le_add_r(g_a64) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) +; LA32-NEXT: lu12i.w $a1, 2048 +; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_addr_offset_1048576: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64) +; LA64-NEXT: addu16i.d $a0, $a0, 128 +; LA64-NEXT: ret +entry: + ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 1048576) +} + +define dso_local ptr @tlsle_load_addr_offset_1048577() nounwind { +; LA32-LABEL: tlsle_load_addr_offset_1048577: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) +; LA32-NEXT: lu12i.w $a1, 2048 +; LA32-NEXT: ori $a1, $a1, 8 +; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_addr_offset_1048577: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64) +; LA64-NEXT: addu16i.d $a0, $a0, 128 +; LA64-NEXT: addi.d $a0, $a0, 8 +; LA64-NEXT: ret +entry: + ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 1048577) +} + +define dso_local ptr @tlsle_load_addr_offset_268432896() nounwind { +; LA32-LABEL: tlsle_load_addr_offset_268432896: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) +; LA32-NEXT: lu12i.w $a1, 524283 +; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_addr_offset_268432896: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64) +; LA64-NEXT: lu12i.w $a1, 524283 +; LA64-NEXT: add.d $a0, $a0, $a1 +; 
LA64-NEXT: ret +entry: + ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 268432896) +} + +define dso_local ptr @tlsle_load_addr_offset_268432897() nounwind { +; LA32-LABEL: tlsle_load_addr_offset_268432897: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) +; LA32-NEXT: lu12i.w $a1, 524283 +; LA32-NEXT: ori $a1, $a1, 8 +; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_addr_offset_268432897: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64) +; LA64-NEXT: lu12i.w $a1, 524283 +; LA64-NEXT: ori $a1, $a1, 8 +; LA64-NEXT: add.d $a0, $a0, $a1 +; LA64-NEXT: ret +entry: + ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 268432897) +} + +define dso_local ptr @tlsle_load_addr_offset_281474439839744() nounwind { +; LA32-LABEL: tlsle_load_addr_offset_281474439839744: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_addr_offset_281474439839744: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64) +; LA64-NEXT: ori $a1, $zero, 0 +; LA64-NEXT: lu32i.d $a1, 524287 +; LA64-NEXT: add.d $a0, $a0, $a1 +; LA64-NEXT: ret +entry: + ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 281474439839744) +} + +define dso_local ptr @tlsle_load_addr_offset_248792680471040() nounwind { +; LA32-LABEL: tlsle_load_addr_offset_248792680471040: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) +; 
LA32-NEXT: lu12i.w $a1, 502733 +; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_addr_offset_248792680471040: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64) +; LA64-NEXT: lu12i.w $a1, 502733 +; LA64-NEXT: lu32i.d $a1, 463412 +; LA64-NEXT: add.d $a0, $a0, $a1 +; LA64-NEXT: ret +entry: + ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 248792680471040) +} + +define dso_local ptr @tlsle_load_addr_offset_9380351707272() nounwind { +; LA32-LABEL: tlsle_load_addr_offset_9380351707272: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) +; LA32-NEXT: lu12i.w $a1, 279556 +; LA32-NEXT: ori $a1, $a1, 1088 +; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_addr_offset_9380351707272: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64) +; LA64-NEXT: lu12i.w $a1, 279556 +; LA64-NEXT: ori $a1, $a1, 1088 +; LA64-NEXT: lu32i.d $a1, 17472 +; LA64-NEXT: add.d $a0, $a0, $a1 +; LA64-NEXT: ret +entry: + ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 9380351707272) +} + +define dso_local ptr @tlsle_load_addr_offset_562949953421312() nounwind { +; LA32-LABEL: tlsle_load_addr_offset_562949953421312: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_addr_offset_562949953421312: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64) +; LA64-NEXT: lu52i.d $a1, $zero, 
1 +; LA64-NEXT: add.d $a0, $a0, $a1 +; LA64-NEXT: ret +entry: + ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 562949953421312) +} + +define dso_local ptr @tlsle_load_addr_offset_614749556925924693() nounwind { +; LA32-LABEL: tlsle_load_addr_offset_614749556925924693: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) +; LA32-NEXT: lu12i.w $a1, 209666 +; LA32-NEXT: ori $a1, $a1, 2728 +; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: tlsle_load_addr_offset_614749556925924693: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64) +; LA64-NEXT: lu12i.w $a1, 209666 +; LA64-NEXT: ori $a1, $a1, 2728 +; LA64-NEXT: lu32i.d $a1, 15288 +; LA64-NEXT: lu52i.d $a1, $a1, 1092 +; LA64-NEXT: add.d $a0, $a0, $a1 +; LA64-NEXT: ret +entry: + ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 614749556925924693) +} + +declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) diff --git a/llvm/test/CodeGen/LoongArch/merge-base-offset.ll b/llvm/test/CodeGen/LoongArch/merge-base-offset.ll index 9df5532d51179..2af206699d4ad 100644 --- a/llvm/test/CodeGen/LoongArch/merge-base-offset.ll +++ b/llvm/test/CodeGen/LoongArch/merge-base-offset.ll @@ -257,8 +257,8 @@ entry: @g_i64 = dso_local global i64 0 -define dso_local i64 @load_64() nounwind { -; LA32-LABEL: load_64: +define dso_local i64 @load_i64() nounwind { +; LA32-LABEL: load_i64: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i64) ; LA32-NEXT: addi.w $a1, $a0, %pc_lo12(g_i64) @@ -266,13 +266,13 @@ define dso_local i64 @load_64() nounwind { ; LA32-NEXT: ld.w $a1, $a1, 4 ; LA32-NEXT: ret ; -; LA64-LABEL: load_64: +; LA64-LABEL: load_i64: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i64) ; LA64-NEXT: ld.d $a0, $a0, 
%pc_lo12(g_i64) ; LA64-NEXT: ret ; -; LA64-LARGE-LABEL: load_64: +; LA64-LARGE-LABEL: load_i64: ; LA64-LARGE: # %bb.0: # %entry ; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_i64) ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i64) @@ -580,36 +580,36 @@ entry: define dso_local void @copy_i32x8() nounwind { ; LA32-LABEL: copy_i32x8: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i32x4_src) -; LA32-NEXT: xvld $xr0, $a0, %pc_lo12(g_i32x4_src) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i32x4_dst) -; LA32-NEXT: xvst $xr0, $a0, %pc_lo12(g_i32x4_dst) +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i32x8_src) +; LA32-NEXT: xvld $xr0, $a0, %pc_lo12(g_i32x8_src) +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i32x8_dst) +; LA32-NEXT: xvst $xr0, $a0, %pc_lo12(g_i32x8_dst) ; LA32-NEXT: ret ; ; LA64-LABEL: copy_i32x8: ; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i32x4_src) -; LA64-NEXT: xvld $xr0, $a0, %pc_lo12(g_i32x4_src) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i32x4_dst) -; LA64-NEXT: xvst $xr0, $a0, %pc_lo12(g_i32x4_dst) +; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i32x8_src) +; LA64-NEXT: xvld $xr0, $a0, %pc_lo12(g_i32x8_src) +; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i32x8_dst) +; LA64-NEXT: xvst $xr0, $a0, %pc_lo12(g_i32x8_dst) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: copy_i32x8: ; LA64-LARGE: # %bb.0: # %entry -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_i32x4_src) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i32x4_src) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i32x4_src) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i32x4_src) +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_i32x8_src) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i32x8_src) +; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i32x8_src) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i32x8_src) ; LA64-LARGE-NEXT: xvldx $xr0, $a1, $a0 -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_i32x4_dst) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i32x4_dst) -; LA64-LARGE-NEXT: lu32i.d 
$a1, %pc64_lo20(g_i32x4_dst) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i32x4_dst) +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_i32x8_dst) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i32x8_dst) +; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i32x8_dst) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i32x8_dst) ; LA64-LARGE-NEXT: xvstx $xr0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: - %0 = load <8 x i32>, ptr @g_i32x4_src, align 32 - store <8 x i32> %0, ptr @g_i32x4_dst, align 32 + %0 = load <8 x i32>, ptr @g_i32x8_src, align 32 + store <8 x i32> %0, ptr @g_i32x8_dst, align 32 ret void } From 52a02b6d1e0c6b492495ff79a3a06ce93e6180b8 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Mon, 10 Feb 2025 04:04:26 -0500 Subject: [PATCH 126/293] [openmp] Fix for 32-bit PowerPC (#126412) --- openmp/runtime/src/kmp_platform.h | 4 ++++ openmp/runtime/src/z_Linux_asm.S | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/openmp/runtime/src/kmp_platform.h b/openmp/runtime/src/kmp_platform.h index 9c2215140467d..2f47efd3e85ff 100644 --- a/openmp/runtime/src/kmp_platform.h +++ b/openmp/runtime/src/kmp_platform.h @@ -116,6 +116,7 @@ #define KMP_ARCH_PPC64_ELFv2 0 #define KMP_ARCH_PPC64_XCOFF 0 #define KMP_ARCH_PPC_XCOFF 0 +#define KMP_ARCH_PPC 0 #define KMP_ARCH_MIPS 0 #define KMP_ARCH_MIPS64 0 #define KMP_ARCH_RISCV64 0 @@ -164,6 +165,9 @@ #define KMP_ARCH_PPC_XCOFF 1 #undef KMP_ARCH_PPC #define KMP_ARCH_PPC 1 +#elif defined(__powerpc__) && !defined(__LP64__) +#undef KMP_ARCH_PPC +#define KMP_ARCH_PPC 1 #elif defined __ARM64_ARCH_8_32__ #undef KMP_ARCH_AARCH64_32 #define KMP_ARCH_AARCH64_32 1 diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S index 0bf9f07a13f14..607bfd8e3cb0f 100644 --- a/openmp/runtime/src/z_Linux_asm.S +++ b/openmp/runtime/src/z_Linux_asm.S @@ -2470,7 +2470,7 @@ __kmp_invoke_microtask: #endif /* KMP_ARCH_S390X */ -#if KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_AARCH64_32 +#if KMP_ARCH_ARM || KMP_ARCH_MIPS 
|| KMP_ARCH_PPC || KMP_ARCH_AARCH64_32 #ifndef KMP_PREFIX_UNDERSCORE # define KMP_PREFIX_UNDERSCORE(x) x #endif From 7aed53eb1982113e825534f0f66d0a0e46e7a5ed Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 10 Feb 2025 10:07:21 +0100 Subject: [PATCH 127/293] [ScalarEvolution] Handle addrec incoming value in isImpliedViaMerge() (#126236) The code already guards against values coming from a previous iteration using properlyDominates(). However, addrecs are considered to properly dominate the loop they are defined in. Handle this special case separately, by checking for expressions that have computable loop evolution (this should cover cases like a zext of an addrec as well). I considered changing the definition of properlyDominates() instead, but decided against it. The current definition is useful in other context, e.g. when deciding whether an expression is safe to expand in a given block. Fixes https://github.com/llvm/llvm-project/issues/126012. --- llvm/lib/Analysis/ScalarEvolution.cpp | 6 ++++++ llvm/test/Transforms/IndVarSimplify/pr126012.ll | 10 +++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index f89887118d8d7..46a5c44f4e41a 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -12400,6 +12400,12 @@ bool ScalarEvolution::isImpliedViaMerge(CmpPredicate Pred, const SCEV *LHS, // iteration of a loop. if (!properlyDominates(L, LBB)) return false; + // Addrecs are considered to properly dominate their loop, so are missed + // by the previous check. Discard any values that have computable + // evolution in this loop. 
+ if (auto *Loop = LI.getLoopFor(LBB)) + if (hasComputableLoopEvolution(L, Loop)) + return false; if (!ProvedEasily(L, RHS)) return false; } diff --git a/llvm/test/Transforms/IndVarSimplify/pr126012.ll b/llvm/test/Transforms/IndVarSimplify/pr126012.ll index 725ea89b8e651..5189fe020dd3b 100644 --- a/llvm/test/Transforms/IndVarSimplify/pr126012.ll +++ b/llvm/test/Transforms/IndVarSimplify/pr126012.ll @@ -1,18 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes=indvars < %s | FileCheck %s -; FIXME: This is a miscompile. +; Do not infer that %cmp is true. The %indvar3 input of %indvar2 comes from +; a previous iteration, so we should not compare it to a value from the current +; iteration. define i32 @test() { ; CHECK-LABEL: define i32 @test() { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[FOR_PREHEADER:.*]] ; CHECK: [[FOR_PREHEADER]]: ; CHECK-NEXT: [[INDVAR1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[PHI:%.*]], %[[FOR_INC:.*]] ] -; CHECK-NEXT: [[INDVAR3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[INDVAR2:%.*]] = phi i32 [ 1, %[[ENTRY]] ], [ [[INDVAR3:%.*]], %[[FOR_INC]] ] +; CHECK-NEXT: [[INDVAR3]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_INC]] ] ; CHECK-NEXT: [[COND1:%.*]] = icmp eq i32 [[INDVAR3]], 0 ; CHECK-NEXT: br i1 [[COND1]], label %[[FOR_INC]], label %[[FOR_END:.*]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[EXT:%.*]] = zext i1 true to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[INDVAR2]], 0 +; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: br label %[[FOR_INC]] ; CHECK: [[FOR_INC]]: ; CHECK-NEXT: [[PHI]] = phi i32 [ [[EXT]], %[[FOR_END]] ], [ 0, %[[FOR_PREHEADER]] ] From b3e74e307ff813abbc32399af31e69114a058212 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 10 Feb 2025 09:09:14 +0000 Subject: [PATCH 128/293] [AArch64] Add SUBHN patterns for xor variant (#126100) `xor x, -1` can be treated as `sub -1, x`, add 
patterns for generating subhn as opposed to a not. Fixes #123999 --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 91 +++++++++------------ llvm/test/CodeGen/AArch64/arm64-vadd.ll | 18 ++-- 2 files changed, 46 insertions(+), 63 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index ce0c260b78410..658ac7490eb33 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6630,60 +6630,43 @@ defm : Neon_addl_extract_patterns; // CodeGen patterns for addhn and subhn instructions, which can actually be // written in LLVM IR without too much difficulty. -// Prioritize ADDHN and SUBHN over UZP2. -let AddedComplexity = 10 in { - -// ADDHN -def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))), - (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm), - (i32 16))))), - (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm), - (i32 32))))), - (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v8i8 V64:$Rd), - (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), - (i32 8))))), - (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v4i16 V64:$Rd), - (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm), - (i32 16))))), - (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v2i32 V64:$Rd), - (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm), - (i32 32))))), - (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; - -// SUBHN -def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))), - (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm), - (i32 16))))), - (SUBHNv4i32_v4i16 V128:$Rn, 
V128:$Rm)>; -def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm), - (i32 32))))), - (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v8i8 V64:$Rd), - (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), - (i32 8))))), - (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v4i16 V64:$Rd), - (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm), - (i32 16))))), - (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v2i32 V64:$Rd), - (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm), - (i32 32))))), - (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; - -} // AddedComplexity = 10 +multiclass AddSubHNPatterns { + def : Pat<(VT64 (trunc (VT128 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 Shift))))), + (ADDHN V128:$Rn, V128:$Rm)>; + let AddedComplexity = 10 in + def : Pat<(concat_vectors (VT64 V64:$Rd), + (trunc (VT128 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 Shift))))), + (ADDHN2 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), V128:$Rn, V128:$Rm)>; + + def : Pat<(VT64 (trunc (VT128 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 Shift))))), + (SUBHN V128:$Rn, V128:$Rm)>; + let AddedComplexity = 10 in + def : Pat<(concat_vectors (VT64 V64:$Rd), + (trunc (VT128 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 Shift))))), + (SUBHN2 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), V128:$Rn, V128:$Rm)>; + + // xor by -1 can also be treated as sub + def : Pat<(VT64 (trunc (VT128 (AArch64vlshr (xor V128:$Rn, immAllOnesV:$Rm), (i32 Shift))))), + (SUBHN V128:$Rm, V128:$Rn)>; + let AddedComplexity = 10 in + def : Pat<(concat_vectors (VT64 V64:$Rd), + (trunc (VT128 (AArch64vlshr (xor V128:$Rn, immAllOnesV:$Rm), + (i32 Shift))))), + (SUBHN2 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), V128:$Rm, V128:$Rn)>; +} + +defm : AddSubHNPatterns; +defm : AddSubHNPatterns; +defm : AddSubHNPatterns; 
//---------------------------------------------------------------------------- // AdvSIMD bitwise extract from vector instruction. diff --git a/llvm/test/CodeGen/AArch64/arm64-vadd.ll b/llvm/test/CodeGen/AArch64/arm64-vadd.ll index c893138cf7a8c..d982dbbb1f69b 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vadd.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vadd.ll @@ -1521,9 +1521,9 @@ define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, ptr %A, ptr %B) nounwind { define <16 x i8> @neg_narrow_i8(<16 x i16> %a) { ; CHECK-SD-LABEL: neg_narrow_i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mvn v1.16b, v1.16b -; CHECK-SD-NEXT: mvn v0.16b, v0.16b -; CHECK-SD-NEXT: uzp2 v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: movi v2.2d, #0xffffffffffffffff +; CHECK-SD-NEXT: subhn v0.8b, v2.8h, v0.8h +; CHECK-SD-NEXT: subhn2 v0.16b, v2.8h, v1.8h ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: neg_narrow_i8: @@ -1542,9 +1542,9 @@ define <16 x i8> @neg_narrow_i8(<16 x i16> %a) { define <8 x i16> @neg_narrow_i16(<8 x i32> %a) { ; CHECK-SD-LABEL: neg_narrow_i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mvn v1.16b, v1.16b -; CHECK-SD-NEXT: mvn v0.16b, v0.16b -; CHECK-SD-NEXT: uzp2 v0.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: movi v2.2d, #0xffffffffffffffff +; CHECK-SD-NEXT: subhn v0.4h, v2.4s, v0.4s +; CHECK-SD-NEXT: subhn2 v0.8h, v2.4s, v1.4s ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: neg_narrow_i16: @@ -1563,9 +1563,9 @@ define <8 x i16> @neg_narrow_i16(<8 x i32> %a) { define <4 x i32> @neg_narrow_i32(<4 x i64> %a) { ; CHECK-SD-LABEL: neg_narrow_i32: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mvn v1.16b, v1.16b -; CHECK-SD-NEXT: mvn v0.16b, v0.16b -; CHECK-SD-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: movi v2.2d, #0xffffffffffffffff +; CHECK-SD-NEXT: subhn v0.2s, v2.2d, v0.2d +; CHECK-SD-NEXT: subhn2 v0.4s, v2.2d, v1.2d ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: neg_narrow_i32: From 317a644ae6d501f1a1ec54d17ea8559bcdea35c0 Mon Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Mon, 10 Feb 2025 09:13:02 +0000 Subject: 
[PATCH 129/293] [SDAG] Precommit tests for #126207 (NFC) (#126208) Add missing test coverage for codepaths touched by #126207. --- llvm/test/CodeGen/AArch64/memcpy-f128.ll | 1 + llvm/test/CodeGen/ARM/memcpy-inline.ll | 27 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/memcpy-f128.ll b/llvm/test/CodeGen/AArch64/memcpy-f128.ll index 5b354dd23e01d..bb411a749bceb 100644 --- a/llvm/test/CodeGen/AArch64/memcpy-f128.ll +++ b/llvm/test/CodeGen/AArch64/memcpy-f128.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=-neon | FileCheck %s %structA = type { i128 } @stubA = internal unnamed_addr constant %structA zeroinitializer, align 8 diff --git a/llvm/test/CodeGen/ARM/memcpy-inline.ll b/llvm/test/CodeGen/ARM/memcpy-inline.ll index 596a58afe46e5..89db22d1f0ed5 100644 --- a/llvm/test/CodeGen/ARM/memcpy-inline.ll +++ b/llvm/test/CodeGen/ARM/memcpy-inline.ll @@ -12,6 +12,7 @@ @.str4 = private unnamed_addr constant [18 x i8] c"DHRYSTONE PROGR \00", align 1 @.str5 = private unnamed_addr constant [7 x i8] c"DHRYST\00", align 1 @.str6 = private unnamed_addr constant [14 x i8] c"/tmp/rmXXXXXX\00", align 1 +@empty = private unnamed_addr constant [31 x i8] zeroinitializer, align 1 @spool.splbuf = internal global [512 x i8] zeroinitializer, align 16 define i32 @t0() { @@ -282,5 +283,31 @@ entry: ret void } +define void @copy_from_zero_constant(ptr nocapture %C) nounwind { +; CHECK-LABEL: copy_from_zero_constant: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q8, #0x0 +; CHECK-NEXT: movs r1, #15 +; CHECK-NEXT: vst1.8 {d16, d17}, [r0], r1 +; CHECK-NEXT: vst1.8 {d16, d17}, [r0] +; CHECK-NEXT: bx lr +; +; CHECK-T1-LABEL: copy_from_zero_constant: +; CHECK-T1: @ %bb.0: @ %entry +; CHECK-T1-NEXT: .save {r7, lr} +; CHECK-T1-NEXT: push {r7, lr} +; CHECK-T1-NEXT: ldr r1, .LCPI8_0 +; CHECK-T1-NEXT: movs r2, #31 +; CHECK-T1-NEXT: bl __aeabi_memcpy +; CHECK-T1-NEXT: pop 
{r7, pc} +; CHECK-T1-NEXT: .p2align 2 +; CHECK-T1-NEXT: @ %bb.1: +; CHECK-T1-NEXT: .LCPI8_0: +; CHECK-T1-NEXT: .long .Lempty +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @empty, i64 31, i1 false) + ret void +} + declare void @llvm.memcpy.p0.p0.i32(ptr nocapture, ptr nocapture, i32, i1) nounwind declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind From 2d31a12dbe2339d20844ede70cbb54dbaf4ceea9 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 10 Feb 2025 10:34:03 +0100 Subject: [PATCH 130/293] [DSE] Don't use initializes on byval argument (#126259) There are two ways we can fix this problem, depending on how the semantics of byval and initializes should interact: * Don't infer initializes on byval arguments. initializes on byval refers to the original caller memory (or having both attributes is made a verifier error). * Infer initializes on byval, but don't use it in DSE. initializes on byval refers to the callee copy. This matches the semantics of readonly on byval. This is slightly more powerful, for example, we could do a backend optimization where byval + initializes will allocate the full size of byval on the stack but not copy over the parts covered by initializes. I went with the second variant here, skipping byval + initializes in DSE (FunctionAttrs already doesn't propagate initializes past byval). I'm open to going in the other direction though. Fixes https://github.com/llvm/llvm-project/issues/126181. 
--- llvm/docs/LangRef.rst | 4 ++++ .../lib/Transforms/Scalar/DeadStoreElimination.cpp | 4 +++- .../DeadStoreElimination/inter-procedural.ll | 14 ++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index f57c29ccdd588..5cdb19fa03fc7 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -1707,6 +1707,10 @@ Currently, only the following parameter attributes are defined: and negative values are allowed in case the argument points partway into an allocation. An empty list is not allowed. + On a ``byval`` argument, ``initializes`` refers to the given parts of the + callee copy being overwritten. A ``byval`` callee can never initialize the + original caller memory passed to the ``byval`` argument. + ``dead_on_unwind`` At a high level, this attribute indicates that the pointer argument is dead if the call unwinds, in the sense that the caller will not depend on the diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 05b4f176bfc31..38454053b039e 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -2283,7 +2283,9 @@ DSEState::getInitializesArgMemLoc(const Instruction *I) { for (unsigned Idx = 0, Count = CB->arg_size(); Idx < Count; ++Idx) { ConstantRangeList Inits; Attribute InitializesAttr = CB->getParamAttr(Idx, Attribute::Initializes); - if (InitializesAttr.isValid()) + // initializes on byval arguments refers to the callee copy, not the + // original memory the caller passed in. 
+ if (InitializesAttr.isValid() && !CB->isByValArgument(Idx)) Inits = InitializesAttr.getValueAsConstantRangeList(); Value *CurArg = CB->getArgOperand(Idx); diff --git a/llvm/test/Transforms/DeadStoreElimination/inter-procedural.ll b/llvm/test/Transforms/DeadStoreElimination/inter-procedural.ll index e590c5bf4004a..5f8ab56c22754 100644 --- a/llvm/test/Transforms/DeadStoreElimination/inter-procedural.ll +++ b/llvm/test/Transforms/DeadStoreElimination/inter-procedural.ll @@ -338,3 +338,17 @@ define i16 @global_var_alias() { ret i16 %l } +declare void @byval_fn(ptr byval(i32) initializes((0, 4)) %am) + +define void @test_byval() { +; CHECK-LABEL: @test_byval( +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 0, ptr [[A]], align 4 +; CHECK-NEXT: call void @byval_fn(ptr [[A]]) +; CHECK-NEXT: ret void +; + %a = alloca i32 + store i32 0, ptr %a + call void @byval_fn(ptr %a) + ret void +} From 7090dff6fe1e788517be0c49ee8c87d7cfa54b63 Mon Sep 17 00:00:00 2001 From: Amir Bishara <139038766+amirBish@users.noreply.github.com> Date: Mon, 10 Feb 2025 11:35:02 +0200 Subject: [PATCH 131/293] [mlir][scf]: Add value bound for the computed upper bound of for loop (#126426) Add additional bound for the induction variable of the `scf.for` such that: `%iv <= %lower_bound + (%trip_count - 1) * step` --- .../SCF/IR/ValueBoundsOpInterfaceImpl.cpp | 27 +++++++++--- .../SCF/value-bounds-op-interface-impl.mlir | 41 +++++++++++++++++++ 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp index 8a27bf186d1c2..410a6bffd345e 100644 --- a/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp @@ -20,6 +20,16 @@ namespace { struct ForOpInterface : public ValueBoundsOpInterface::ExternalModel { + static AffineExpr getTripCountExpr(scf::ForOp forOp, + ValueBoundsConstraintSet &cstr) { + AffineExpr 
lbExpr = cstr.getExpr(forOp.getLowerBound()); + AffineExpr ubExpr = cstr.getExpr(forOp.getUpperBound()); + AffineExpr stepExpr = cstr.getExpr(forOp.getStep()); + AffineExpr tripCountExpr = + AffineExpr(ubExpr - lbExpr).ceilDiv(stepExpr); // (ub - lb) / step + return tripCountExpr; + } + /// Populate bounds of values/dimensions for iter_args/OpResults. If the /// value/dimension size does not change in an iteration, we can deduce that /// it the same as the initial value/dimension. @@ -77,11 +87,7 @@ struct ForOpInterface // `value` is result of `forOp`, we can prove that: // %result == %init_arg + trip_count * (%yielded_value - %iter_arg). // Where trip_count is (ub - lb) / step. - AffineExpr lbExpr = cstr.getExpr(forOp.getLowerBound()); - AffineExpr ubExpr = cstr.getExpr(forOp.getUpperBound()); - AffineExpr stepExpr = cstr.getExpr(forOp.getStep()); - AffineExpr tripCountExpr = - AffineExpr(ubExpr - lbExpr).ceilDiv(stepExpr); // (ub - lb) / step + AffineExpr tripCountExpr = getTripCountExpr(forOp, cstr); AffineExpr oneIterAdvanceExpr = cstr.getExpr(yieldedValue) - cstr.getExpr(iterArg); cstr.bound(value) == @@ -93,9 +99,18 @@ struct ForOpInterface auto forOp = cast(op); if (value == forOp.getInductionVar()) { - // TODO: Take into account step size. cstr.bound(value) >= forOp.getLowerBound(); cstr.bound(value) < forOp.getUpperBound(); + // iv <= lb + ((ub-lb)/step - 1) * step + // This bound does not replace the `iv < ub` constraint mentioned above, + // since constraints involving the multiplication of two constraint set + // dimensions are not supported. 
+ AffineExpr tripCountMinusOne = + getTripCountExpr(forOp, cstr) - cstr.getExpr(1); + AffineExpr computedUpperBound = + cstr.getExpr(forOp.getLowerBound()) + + AffineExpr(tripCountMinusOne * cstr.getExpr(forOp.getStep())); + cstr.bound(value) <= computedUpperBound; return; } diff --git a/mlir/test/Dialect/SCF/value-bounds-op-interface-impl.mlir b/mlir/test/Dialect/SCF/value-bounds-op-interface-impl.mlir index b48f38f592dc9..339d97df001c5 100644 --- a/mlir/test/Dialect/SCF/value-bounds-op-interface-impl.mlir +++ b/mlir/test/Dialect/SCF/value-bounds-op-interface-impl.mlir @@ -270,6 +270,47 @@ func.func @compare_scf_for(%a: index, %b: index, %c: index) { // ----- +func.func @scf_for_induction_var_upper_bound() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + %c5 = arith.constant 5 : index + %c8 = arith.constant 8 : index + %c10 = arith.constant 10 : index + scf.for %iv = %c0 to %c10 step %c4 { + // expected-remark @below{{true}} + "test.compare"(%iv, %c8) {cmp = "LE"} : (index, index) -> () + } + scf.for %iv = %c2 to %c8 step %c3 { + // expected-remark @below{{true}} + "test.compare"(%iv, %c5) {cmp = "LE"} : (index, index) -> () + } + return +} + +// ----- + +#map_ceildiv_dynamic_divisor = affine_map<(i)[s] -> (i ceildiv s)> +func.func @scf_for_induction_var_computed_upper_bound(%upperBound: index, %step: index) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %tripCount = affine.apply #map_ceildiv_dynamic_divisor (%upperBound)[%step] + %tripCountMinusOne = arith.subi %tripCount, %c1 : index + %computedUpperBound = arith.muli %tripCountMinusOne, %step : index + scf.for %iv = %c0 to %upperBound step %step { + // TODO: Value bounds analysis will fail to compute upper bound + // because multiplication/division of unknown block arguments is + // not supported. 
+ // expected-error @below{{unknown}} + "test.compare"(%iv, %computedUpperBound) {cmp = "LE"} : (index, index) -> () + } + return +} + +// ----- + func.func @scf_for_result_infer() { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index From 6fd99de31864a5ef84ae8613b3a9034e05293461 Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Mon, 10 Feb 2025 10:58:56 +0100 Subject: [PATCH 132/293] Revert "[LinkerWrapper] Clean up options after proper forwarding" (#126495) Reverts llvm/llvm-project#126297 Broken buildbots https://lab.llvm.org/staging/#/builders/105/builds/15554 https://lab.llvm.org/buildbot/#/builders/30/builds/15490 Error is ``` # .---command stderr------------ # | FileCheck error: '/work/janplehr/git/llvm-project/bot-tester-builds/cmakecachebuild/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/bug51781.c.tmp.custom' is empty. # | FileCheck command line: /home/janplehr/git/llvm-project/bot-tester-builds/cmakecachebuild/./bin/FileCheck /work/janplehr/git/llvm-project/offload/test/offloading/bug51781.c -check-prefix=CUSTOM -input-file=/work/janplehr/git/llvm-project/bot-tester-builds/cmakecachebuild/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/bug51781.c.tmp.custom ``` The file is empty, while the `CUSTOM` check-target expects to find ``` // CUSTOM: Rewriting generic-mode kernel with a customized state machine. 
``` --- clang/lib/Driver/ToolChains/Clang.cpp | 11 --------- clang/test/Driver/linker-wrapper.c | 24 +++++++++---------- clang/test/Driver/openmp-offload.c | 10 -------- .../ClangLinkerWrapper.cpp | 23 +++++++++++++++++- .../clang-linker-wrapper/LinkerWrapperOpts.td | 22 +++++++++++++---- 5 files changed, 51 insertions(+), 39 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 82f4cabd620d7..ea376ac00d910 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -9252,14 +9252,6 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, for (StringRef Arg : LinkerArgs) CmdArgs.push_back(Args.MakeArgString( "--device-linker=" + TC->getTripleString() + "=" + Arg)); - - // Forward the LTO mode relying on the Driver's parsing. - if (C.getDriver().getOffloadLTOMode() == LTOK_Full) - CmdArgs.push_back(Args.MakeArgString( - "--device-compiler=" + TC->getTripleString() + "=-flto=full")); - else if (C.getDriver().getOffloadLTOMode() == LTOK_Thin) - CmdArgs.push_back(Args.MakeArgString( - "--device-compiler=" + TC->getTripleString() + "=-flto=thin")); } } @@ -9267,9 +9259,6 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, Args.MakeArgString("--host-triple=" + getToolChain().getTripleString())); if (Args.hasArg(options::OPT_v)) CmdArgs.push_back("--wrapper-verbose"); - if (Arg *A = Args.getLastArg(options::OPT_cuda_path_EQ)) - CmdArgs.push_back( - Args.MakeArgString(Twine("--cuda-path=") + A->getValue())); // Construct the link job so we can wrap around it. 
Linker->ConstructJob(C, JA, Output, Inputs, Args, LinkingOutput); diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c index e7b7af7bdfbf3..f416ee5f4463b 100644 --- a/clang/test/Driver/linker-wrapper.c +++ b/clang/test/Driver/linker-wrapper.c @@ -21,16 +21,16 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ // RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=NVPTX-LINK -// NVPTX-LINK: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}.o {{.*}}.o +// NVPTX-LINK: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 -flto {{.*}}.o {{.*}}.o // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out -// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --device-compiler=-g \ +// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --device-debug -O0 \ // RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=NVPTX-LINK-DEBUG -// NVPTX-LINK-DEBUG: clang{{.*}} --target=nvptx64-nvidia-cuda -march=sm_70 {{.*}}-g +// NVPTX-LINK-DEBUG: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 -flto {{.*}}.o {{.*}}.o -g // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ @@ -39,16 +39,16 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ // RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=AMDGPU-LINK -// AMDGPU-LINK: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa 
-mcpu=gfx908 -Wl,--no-undefined {{.*}}.o {{.*}}.o +// AMDGPU-LINK: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.amdgpu.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030 \ // RUN: --image=file=%t.amdgpu.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out -// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --device-compiler=--save-temps \ +// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --save-temps -O2 \ // RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=AMDGPU-LTO-TEMPS -// AMDGPU-LTO-TEMPS: clang{{.*}} --target=amdgcn-amd-amdhsa -mcpu=gfx1030 {{.*}}-save-temps +// AMDGPU-LTO-TEMPS: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -O2 -flto -Wl,--no-undefined {{.*}}.o -save-temps // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu \ @@ -59,7 +59,7 @@ __attribute__((visibility("protected"), used)) int x; // RUN: --linker-path=/usr/bin/ld.lld --whole-archive %t.a --no-whole-archive \ // RUN: %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CPU-LINK -// CPU-LINK: clang{{.*}} -o {{.*}}.img --target=x86_64-unknown-linux-gnu -march=native -Wl,--no-undefined {{.*}}.o {{.*}}.o -Wl,-Bsymbolic -shared -Wl,--whole-archive {{.*}}.a -Wl,--no-whole-archive +// CPU-LINK: clang{{.*}} -o {{.*}}.img --target=x86_64-unknown-linux-gnu -march=native -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o -Wl,-Bsymbolic -shared -Wl,--whole-archive {{.*}}.a -Wl,--no-whole-archive // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o // RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu -mllvm -openmp-opt-disable \ @@ -148,7 +148,7 @@ 
__attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --clang-backend \ // RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CLANG-BACKEND -// CLANG-BACKEND: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -Wl,--no-undefined {{.*}}.o +// CLANG-BACKEND: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -flto -Wl,--no-undefined {{.*}}.o // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 @@ -171,8 +171,8 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ // RUN: --linker-path=/usr/bin/ld %t-on.o %t-off.o %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=AMD-TARGET-ID -// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+ -Wl,--no-undefined {{.*}}.o {{.*}}.o -// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack- -Wl,--no-undefined {{.*}}.o {{.*}}.o +// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+ -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o +// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack- -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o // RUN: clang-offload-packager -o %t-lib.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=generic @@ -187,8 +187,8 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ // RUN: --linker-path=/usr/bin/ld %t1.o %t2.o %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=ARCH-ALL -// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a -Wl,--no-undefined {{.*}}.o {{.*}}.o -// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 
-Wl,--no-undefined {{.*}}.o {{.*}}.o +// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o +// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu \ diff --git a/clang/test/Driver/openmp-offload.c b/clang/test/Driver/openmp-offload.c index 2cf2643af6c15..6f56ae00ba065 100644 --- a/clang/test/Driver/openmp-offload.c +++ b/clang/test/Driver/openmp-offload.c @@ -208,13 +208,3 @@ // RUN: -fsyntax-only %s 2>&1 | FileCheck -check-prefix=CHK-SYNTAX-ONLY-ARGS %s // CHK-SYNTAX-ONLY-ARGS: "-cc1" "-triple" "powerpc64le-ibm-linux-gnu"{{.*}}"-fsyntax-only" // CHK-SYNTAX-ONLY-ARGS: "-cc1" "-triple" "powerpc64le-unknown-linux"{{.*}}"-fsyntax-only" - -// -// Ensure `-foffload-lto` is forwarded properly. -// -// RUN: %clang -### --target=powerpc64le-linux -fopenmp=libomp -fopenmp-targets=powerpc64le-ibm-linux-gnu \ -// RUN: -foffload-lto %s 2>&1 | FileCheck -check-prefix=CHK-DEVICE-LTO-FULL %s -// CHK-DEVICE-LTO-FULL: clang-linker-wrapper{{.*}} "--device-compiler=powerpc64le-ibm-linux-gnu=-flto=full" -// RUN: %clang -### --target=powerpc64le-linux -fopenmp=libomp -fopenmp-targets=powerpc64le-ibm-linux-gnu \ -// RUN: -foffload-lto=thin %s 2>&1 | FileCheck -check-prefix=CHK-DEVICE-LTO-THIN %s -// CHK-DEVICE-LTO-THIN: clang-linker-wrapper{{.*}} "--device-compiler=powerpc64le-ibm-linux-gnu=-flto=thin" diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 1a82a1c59b721..b189cfee674dd 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -485,6 +485,7 @@ Expected clang(ArrayRef InputFiles, const ArgList &Args) { if (!TempFileOrErr) return TempFileOrErr.takeError(); + 
StringRef OptLevel = Args.getLastArgValue(OPT_opt_level, "O2"); SmallVector CmdArgs{ *ClangPath, "--no-default-config", @@ -492,9 +493,12 @@ Expected clang(ArrayRef InputFiles, const ArgList &Args) { *TempFileOrErr, Args.MakeArgString("--target=" + Triple.getTriple()), Triple.isAMDGPU() ? Args.MakeArgString("-mcpu=" + Arch) - : Args.MakeArgString("-march=" + Arch)}; + : Args.MakeArgString("-march=" + Arch), + Args.MakeArgString("-" + OptLevel), + }; // Forward all of the `--offload-opt` and similar options to the device. + CmdArgs.push_back("-flto"); for (auto &Arg : Args.filtered(OPT_offload_opt_eq_minus, OPT_mllvm)) CmdArgs.append( {"-Xlinker", @@ -543,12 +547,29 @@ Expected clang(ArrayRef InputFiles, const ArgList &Args) { CmdArgs.append({"-Xlinker", Args.MakeArgString( "-mllvm=" + StringRef(Arg->getValue()))}); + if (Args.hasArg(OPT_debug)) + CmdArgs.push_back("-g"); + + if (SaveTemps) + CmdArgs.push_back("-save-temps"); + if (SaveTemps && linkerSupportsLTO(Args)) CmdArgs.push_back("-Wl,--save-temps"); if (Args.hasArg(OPT_embed_bitcode)) CmdArgs.push_back("-Wl,--lto-emit-llvm"); + if (Verbose) + CmdArgs.push_back("-v"); + + if (!CudaBinaryPath.empty()) + CmdArgs.push_back(Args.MakeArgString("--cuda-path=" + CudaBinaryPath)); + + for (StringRef Arg : Args.getAllArgValues(OPT_ptxas_arg)) + llvm::copy( + SmallVector({"-Xcuda-ptxas", Args.MakeArgString(Arg)}), + std::back_inserter(CmdArgs)); + for (StringRef Arg : Args.getAllArgValues(OPT_linker_arg_EQ)) CmdArgs.append({"-Xlinker", Args.MakeArgString(Arg)}); for (StringRef Arg : Args.getAllArgValues(OPT_compiler_arg_EQ)) diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td index 17fb9db35fe39..57d918db0a73c 100644 --- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td +++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td @@ -17,9 +17,11 @@ def cuda_path_EQ : Joined<["--"], "cuda-path=">, Flags<[WrapperOnlyOption]>, MetaVarName<"">, 
HelpText<"Set the system CUDA path">; def host_triple_EQ : Joined<["--"], "host-triple=">, - Flags<[WrapperOnlyOption]>, - MetaVarName<"">, - HelpText<"Triple to use for the host compilation">; + Flags<[WrapperOnlyOption]>, MetaVarName<"">, + HelpText<"Triple to use for the host compilation">; +def opt_level : Joined<["--"], "opt-level=">, + Flags<[WrapperOnlyOption]>, MetaVarName<"">, + HelpText<"Optimization level for LTO">; def device_linker_args_EQ : Joined<["--"], "device-linker=">, Flags<[WrapperOnlyOption]>, MetaVarName<" or =">, HelpText<"Arguments to pass to the device linker invocation">; @@ -32,8 +34,18 @@ def dry_run : Flag<["--"], "dry-run">, def verbose : Flag<["--"], "wrapper-verbose">, Flags<[WrapperOnlyOption]>, HelpText<"Verbose output from tools">; def embed_bitcode : Flag<["--"], "embed-bitcode">, - Flags<[WrapperOnlyOption]>, - HelpText<"Embed linked bitcode in the module">; + Flags<[WrapperOnlyOption]>, HelpText<"Embed linked bitcode in the module">; +def debug : Flag<["--"], "device-debug">, Flags<[WrapperOnlyOption]>, + HelpText<"Use debugging">; +def ptxas_arg : Joined<["--"], "ptxas-arg=">, + Flags<[WrapperOnlyOption]>, + HelpText<"Argument to pass to the 'ptxas' invocation">; +def pass_remarks_EQ : Joined<["--"], "pass-remarks=">, + Flags<[WrapperOnlyOption]>, HelpText<"Pass remarks for LTO">; +def pass_remarks_missed_EQ : Joined<["--"], "pass-remarks-missed=">, + Flags<[WrapperOnlyOption]>, HelpText<"Pass remarks for LTO">; +def pass_remarks_analysis_EQ : Joined<["--"], "pass-remarks-analysis=">, + Flags<[WrapperOnlyOption]>, HelpText<"Pass remarks for LTO">; def print_wrapped_module : Flag<["--"], "print-wrapped-module">, Flags<[WrapperOnlyOption]>, HelpText<"Print the wrapped module's IR for testing">; From 4dec3909e93c23ef1545c934f9715f9be2d7c49b Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Mon, 10 Feb 2025 10:19:22 +0000 Subject: [PATCH 133/293] [libclc] Have all targets build all CLC functions (#124779) This removes all 
remaining SPIR-V workarounds for CLC functions, in an effort to streamline the CLC implementation and prevent further issues that #124614 had to fix. This commit fixes the same issue for the SPIR-V targets. Target-specific CLC implementations can and will exist, but for now they're all identical and so the target-specific SOURCES files have been removed. Target implementations now always include the 'generic' CLC directory, meaning we can avoid unnecessary duplication of SOURCES listings. --- libclc/CMakeLists.txt | 16 +++++------- libclc/clc/include/clc/integer/clc_abs.h | 7 ----- libclc/clc/include/clc/integer/clc_abs_diff.h | 7 ----- libclc/clc/include/clc/relational/clc_all.h | 7 ----- libclc/clc/include/clc/relational/clc_any.h | 7 ----- .../clc/include/clc/relational/clc_isequal.h | 7 ----- .../clc/include/clc/relational/clc_isfinite.h | 7 ----- .../include/clc/relational/clc_isgreater.h | 7 ----- .../clc/relational/clc_isgreaterequal.h | 7 ----- libclc/clc/include/clc/relational/clc_isinf.h | 7 ----- .../clc/include/clc/relational/clc_isless.h | 7 ----- .../include/clc/relational/clc_islessequal.h | 7 ----- .../clc/relational/clc_islessgreater.h | 7 ----- .../clc/include/clc/relational/clc_isnormal.h | 7 ----- .../include/clc/relational/clc_isnotequal.h | 7 ----- .../include/clc/relational/clc_isordered.h | 7 ----- .../include/clc/relational/clc_isunordered.h | 7 ----- .../clc/include/clc/relational/clc_signbit.h | 7 ----- libclc/clc/include/clc/shared/clc_max.h | 7 ----- libclc/clc/include/clc/shared/clc_min.h | 7 ----- libclc/clc/lib/clspv/SOURCES | 23 ---------------- libclc/clc/lib/spirv/SOURCES | 26 ------------------- 22 files changed, 7 insertions(+), 191 deletions(-) delete mode 100644 libclc/clc/lib/clspv/SOURCES delete mode 100644 libclc/clc/lib/spirv/SOURCES diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 8e3f5097ba84a..b28da904ef68e 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -32,10 +32,6 @@ 
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS spirv64/lib/SOURCES; # CLC internal libraries clc/lib/generic/SOURCES; - clc/lib/clspv/SOURCES; - clc/lib/clspv64/SOURCES; - clc/lib/spirv/SOURCES; - clc/lib/spirv64/SOURCES; ) set( LIBCLC_MIN_LLVM 3.9.0 ) @@ -266,15 +262,15 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) list( GET TRIPLE 1 VENDOR ) list( GET TRIPLE 2 OS ) - set( dirs ) + set( opencl_dirs ) if ( NOT ${ARCH} STREQUAL spirv AND NOT ${ARCH} STREQUAL spirv64 AND NOT ${ARCH} STREQUAL clspv AND NOT ${ARCH} STREQUAL clspv64) - LIST( APPEND dirs generic ) + LIST( APPEND opencl_dirs generic ) endif() if( ${ARCH} STREQUAL r600 OR ${ARCH} STREQUAL amdgcn ) - list( APPEND dirs amdgpu ) + list( APPEND opencl_dirs amdgpu ) endif() # Some targets' directories alias others @@ -291,11 +287,13 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) endif() set( clc_lib_files ) + set( clc_dirs ${dirs} generic ) + libclc_configure_lib_source( clc_lib_files CLC_INTERNAL LIB_ROOT_DIR clc - DIRS ${dirs} ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS} + DIRS ${clc_dirs} ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS} ) set( opencl_lib_files ) @@ -312,7 +310,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) libclc_configure_lib_source( opencl_lib_files - DIRS ${dirs} ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS} + DIRS ${opencl_dirs} ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS} ) foreach( d ${${t}_devices} ) diff --git a/libclc/clc/include/clc/integer/clc_abs.h b/libclc/clc/include/clc/integer/clc_abs.h index 31c62d311a006..59bd807b96060 100644 --- a/libclc/clc/include/clc/integer/clc_abs.h +++ b/libclc/clc/include/clc/integer/clc_abs.h @@ -1,14 +1,7 @@ #ifndef __CLC_INTEGER_CLC_ABS_H__ #define __CLC_INTEGER_CLC_ABS_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible abs -#define __clc_abs abs -#else - #define __CLC_BODY #include -#endif - #endif // __CLC_INTEGER_CLC_ABS_H__ diff --git 
a/libclc/clc/include/clc/integer/clc_abs_diff.h b/libclc/clc/include/clc/integer/clc_abs_diff.h index 9c33fcff23b79..021a9b6bc45a0 100644 --- a/libclc/clc/include/clc/integer/clc_abs_diff.h +++ b/libclc/clc/include/clc/integer/clc_abs_diff.h @@ -1,14 +1,7 @@ #ifndef __CLC_INTEGER_CLC_ABS_DIFF_H__ #define __CLC_INTEGER_CLC_ABS_DIFF_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible abs_diff -#define __clc_abs_diff abs_diff -#else - #define __CLC_BODY #include -#endif - #endif // __CLC_INTEGER_CLC_ABS_DIFF_H__ diff --git a/libclc/clc/include/clc/relational/clc_all.h b/libclc/clc/include/clc/relational/clc_all.h index 7be3d132dd53d..2ffced19ba0e5 100644 --- a/libclc/clc/include/clc/relational/clc_all.h +++ b/libclc/clc/include/clc/relational/clc_all.h @@ -1,11 +1,6 @@ #ifndef __CLC_RELATIONAL_CLC_ALL_H__ #define __CLC_RELATIONAL_CLC_ALL_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible all -#define __clc_all all -#else - #include #include @@ -27,6 +22,4 @@ _CLC_VECTOR_ALL_DECL(long) #undef _CLC_ALL_DECL #undef _CLC_VECTOR_ALL_DECL -#endif - #endif // __CLC_RELATIONAL_CLC_ALL_H__ diff --git a/libclc/clc/include/clc/relational/clc_any.h b/libclc/clc/include/clc/relational/clc_any.h index 27dbffeb2eecd..2f554334d9bac 100644 --- a/libclc/clc/include/clc/relational/clc_any.h +++ b/libclc/clc/include/clc/relational/clc_any.h @@ -1,11 +1,6 @@ #ifndef __CLC_RELATIONAL_CLC_ANY_H__ #define __CLC_RELATIONAL_CLC_ANY_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible any -#define __clc_any any -#else - #include #include @@ -27,6 +22,4 @@ _CLC_VECTOR_ANY_DECL(long) #undef _CLC_ANY_DECL #undef _CLC_VECTOR_ANY_DECL -#endif - #endif // __CLC_RELATIONAL_CLC_ANY_H__ diff --git a/libclc/clc/include/clc/relational/clc_isequal.h b/libclc/clc/include/clc/relational/clc_isequal.h index 
0f31fb9530a14..84bf0974dbbf5 100644 --- a/libclc/clc/include/clc/relational/clc_isequal.h +++ b/libclc/clc/include/clc/relational/clc_isequal.h @@ -1,11 +1,6 @@ #ifndef __CLC_RELATIONAL_CLC_ISEQUAL_H__ #define __CLC_RELATIONAL_CLC_ISEQUAL_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible isequal -#define __clc_isequal isequal -#else - #include #include @@ -37,6 +32,4 @@ _CLC_VECTOR_ISEQUAL_DECL(half, short) #undef _CLC_ISEQUAL_DECL #undef _CLC_VECTOR_ISEQUAL_DECL -#endif - #endif // __CLC_RELATIONAL_CLC_ISEQUAL_H__ diff --git a/libclc/clc/include/clc/relational/clc_isfinite.h b/libclc/clc/include/clc/relational/clc_isfinite.h index 3ed276e07a2f1..82bcc6ec2da27 100644 --- a/libclc/clc/include/clc/relational/clc_isfinite.h +++ b/libclc/clc/include/clc/relational/clc_isfinite.h @@ -1,11 +1,6 @@ #ifndef __CLC_RELATIONAL_CLC_ISFINITE_H__ #define __CLC_RELATIONAL_CLC_ISFINITE_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible isfinite -#define __clc_isfinite isfinite -#else - #define __CLC_FUNCTION __clc_isfinite #define __CLC_BODY @@ -14,6 +9,4 @@ #undef __CLC_BODY #undef __CLC_FUNCTION -#endif - #endif // __CLC_RELATIONAL_CLC_ISFINITE_H__ diff --git a/libclc/clc/include/clc/relational/clc_isgreater.h b/libclc/clc/include/clc/relational/clc_isgreater.h index b51d59aeb5499..31961e4c51679 100644 --- a/libclc/clc/include/clc/relational/clc_isgreater.h +++ b/libclc/clc/include/clc/relational/clc_isgreater.h @@ -1,11 +1,6 @@ #ifndef __CLC_RELATIONAL_CLC_ISGREATER_H__ #define __CLC_RELATIONAL_CLC_ISGREATER_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible isgreater -#define __clc_isgreater isgreater -#else - #define __CLC_FUNCTION __clc_isgreater #define __CLC_BODY @@ -14,6 +9,4 @@ #undef __CLC_BODY #undef __CLC_FUNCTION -#endif - #endif // __CLC_RELATIONAL_CLC_ISGREATER_H__ diff --git 
a/libclc/clc/include/clc/relational/clc_isgreaterequal.h b/libclc/clc/include/clc/relational/clc_isgreaterequal.h index b7ffce151847f..0e072fad09655 100644 --- a/libclc/clc/include/clc/relational/clc_isgreaterequal.h +++ b/libclc/clc/include/clc/relational/clc_isgreaterequal.h @@ -1,11 +1,6 @@ #ifndef __CLC_RELATIONAL_CLC_ISGREATEREQUAL_H__ #define __CLC_RELATIONAL_CLC_ISGREATEREQUAL_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible isgreaterequal -#define __clc_isgreaterequal isgreaterequal -#else - #define __CLC_FUNCTION __clc_isgreaterequal #define __CLC_BODY @@ -14,6 +9,4 @@ #undef __CLC_BODY #undef __CLC_FUNCTION -#endif - #endif // __CLC_RELATIONAL_CLC_ISGREATEREQUAL_H__ diff --git a/libclc/clc/include/clc/relational/clc_isinf.h b/libclc/clc/include/clc/relational/clc_isinf.h index 3f60bec5654a2..b666953d4a8e6 100644 --- a/libclc/clc/include/clc/relational/clc_isinf.h +++ b/libclc/clc/include/clc/relational/clc_isinf.h @@ -1,11 +1,6 @@ #ifndef __CLC_RELATIONAL_CLC_ISINF_H__ #define __CLC_RELATIONAL_CLC_ISINF_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible isinf -#define __clc_isinf isinf -#else - #include #include @@ -37,6 +32,4 @@ _CLC_VECTOR_ISINF_DECL(short, half) #undef _CLC_ISINF_DECL #undef _CLC_VECTOR_ISINF_DECL -#endif - #endif // __CLC_RELATIONAL_CLC_ISINF_H__ diff --git a/libclc/clc/include/clc/relational/clc_isless.h b/libclc/clc/include/clc/relational/clc_isless.h index c6950aa61ad90..482fddfe4f8af 100644 --- a/libclc/clc/include/clc/relational/clc_isless.h +++ b/libclc/clc/include/clc/relational/clc_isless.h @@ -1,11 +1,6 @@ #ifndef __CLC_RELATIONAL_CLC_ISLESS_H__ #define __CLC_RELATIONAL_CLC_ISLESS_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible isless -#define __clc_isless isless -#else - #define __CLC_FUNCTION __clc_isless #define __CLC_BODY @@ 
-14,6 +9,4 @@ #undef __CLC_BODY #undef __CLC_FUNCTION -#endif - #endif // __CLC_RELATIONAL_CLC_ISLESS_H__ diff --git a/libclc/clc/include/clc/relational/clc_islessequal.h b/libclc/clc/include/clc/relational/clc_islessequal.h index 7efac163e106a..520f3d9c6ffd6 100644 --- a/libclc/clc/include/clc/relational/clc_islessequal.h +++ b/libclc/clc/include/clc/relational/clc_islessequal.h @@ -1,11 +1,6 @@ #ifndef __CLC_RELATIONAL_CLC_ISLESSEQUAL_H__ #define __CLC_RELATIONAL_CLC_ISLESSEQUAL_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible islessequal -#define __clc_islessequal islessequal -#else - #define __CLC_FUNCTION __clc_islessequal #define __CLC_BODY @@ -14,6 +9,4 @@ #undef __CLC_BODY #undef __CLC_FUNCTION -#endif - #endif // __CLC_RELATIONAL_CLC_ISLESSEQUAL_H__ diff --git a/libclc/clc/include/clc/relational/clc_islessgreater.h b/libclc/clc/include/clc/relational/clc_islessgreater.h index df3c5e513c86c..e90eadbbca5e5 100644 --- a/libclc/clc/include/clc/relational/clc_islessgreater.h +++ b/libclc/clc/include/clc/relational/clc_islessgreater.h @@ -1,11 +1,6 @@ #ifndef __CLC_RELATIONAL_CLC_ISLESSGREATER_H__ #define __CLC_RELATIONAL_CLC_ISLESSGREATER_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible islessgreater -#define __clc_islessgreater islessgreater -#else - #define __CLC_FUNCTION __clc_islessgreater #define __CLC_BODY @@ -14,6 +9,4 @@ #undef __CLC_BODY #undef __CLC_FUNCTION -#endif - #endif // __CLC_RELATIONAL_CLC_ISLESSGREATER_H__ diff --git a/libclc/clc/include/clc/relational/clc_isnormal.h b/libclc/clc/include/clc/relational/clc_isnormal.h index 48ee6b83a5711..269abf0037411 100644 --- a/libclc/clc/include/clc/relational/clc_isnormal.h +++ b/libclc/clc/include/clc/relational/clc_isnormal.h @@ -1,11 +1,6 @@ #ifndef __CLC_RELATIONAL_CLC_ISNORMAL_H__ #define __CLC_RELATIONAL_CLC_ISNORMAL_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) 
-// clspv and spir-v targets provide their own OpenCL-compatible isnormal -#define __clc_isnormal isnormal -#else - #define __CLC_FUNCTION __clc_isnormal #define __CLC_BODY @@ -14,6 +9,4 @@ #undef __CLC_BODY #undef __CLC_FUNCTION -#endif - #endif // __CLC_RELATIONAL_CLC_ISNORMAL_H__ diff --git a/libclc/clc/include/clc/relational/clc_isnotequal.h b/libclc/clc/include/clc/relational/clc_isnotequal.h index 55c1bd91b2dd5..598657658ec58 100644 --- a/libclc/clc/include/clc/relational/clc_isnotequal.h +++ b/libclc/clc/include/clc/relational/clc_isnotequal.h @@ -1,11 +1,6 @@ #ifndef __CLC_RELATIONAL_CLC_ISNOTEQUAL_H__ #define __CLC_RELATIONAL_CLC_ISNOTEQUAL_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible isnotequal -#define __clc_isnotequal isnotequal -#else - #define __CLC_FUNCTION __clc_isnotequal #define __CLC_BODY @@ -14,6 +9,4 @@ #undef __CLC_BODY #undef __CLC_FUNCTION -#endif - #endif // __CLC_RELATIONAL_CLC_ISNOTEQUAL_H__ diff --git a/libclc/clc/include/clc/relational/clc_isordered.h b/libclc/clc/include/clc/relational/clc_isordered.h index 5ce2bfe334027..f4363d3d8a832 100644 --- a/libclc/clc/include/clc/relational/clc_isordered.h +++ b/libclc/clc/include/clc/relational/clc_isordered.h @@ -1,11 +1,6 @@ #ifndef __CLC_RELATIONAL_CLC_ISORDERED_H__ #define __CLC_RELATIONAL_CLC_ISORDERED_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible isordered -#define __clc_isordered isordered -#else - #define __CLC_FUNCTION __clc_isordered #define __CLC_BODY @@ -14,6 +9,4 @@ #undef __CLC_BODY #undef __CLC_FUNCTION -#endif - #endif // __CLC_RELATIONAL_CLC_ISORDERED_H__ diff --git a/libclc/clc/include/clc/relational/clc_isunordered.h b/libclc/clc/include/clc/relational/clc_isunordered.h index 305d2b4e9131f..e7f01826d5cc9 100644 --- a/libclc/clc/include/clc/relational/clc_isunordered.h +++ b/libclc/clc/include/clc/relational/clc_isunordered.h @@ -1,11 +1,6 
@@ #ifndef __CLC_RELATIONAL_CLC_ISUNORDERED_H__ #define __CLC_RELATIONAL_CLC_ISUNORDERED_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible isunordered -#define __clc_isunordered isunordered -#else - #define __CLC_FUNCTION __clc_isunordered #define __CLC_BODY @@ -14,6 +9,4 @@ #undef __CLC_BODY #undef __CLC_FUNCTION -#endif - #endif // __CLC_RELATIONAL_CLC_ISUNORDERED_H__ diff --git a/libclc/clc/include/clc/relational/clc_signbit.h b/libclc/clc/include/clc/relational/clc_signbit.h index 45a7112c9eb96..55561dd834871 100644 --- a/libclc/clc/include/clc/relational/clc_signbit.h +++ b/libclc/clc/include/clc/relational/clc_signbit.h @@ -1,11 +1,6 @@ #ifndef __CLC_RELATIONAL_CLC_SIGNBIT_H__ #define __CLC_RELATIONAL_CLC_SIGNBIT_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible signbit -#define __clc_signbit signbit -#else - #define __CLC_FUNCTION __clc_signbit #define __CLC_BODY @@ -14,6 +9,4 @@ #undef __CLC_BODY #undef __CLC_FUNCTION -#endif - #endif // __CLC_RELATIONAL_CLC_SIGNBIT_H__ diff --git a/libclc/clc/include/clc/shared/clc_max.h b/libclc/clc/include/clc/shared/clc_max.h index 388f001a27782..9bfa05552a399 100644 --- a/libclc/clc/include/clc/shared/clc_max.h +++ b/libclc/clc/include/clc/shared/clc_max.h @@ -1,17 +1,10 @@ #ifndef __CLC_SHARED_CLC_MAX_H__ #define __CLC_SHARED_CLC_MAX_H__ -#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible max -#define __clc_max max -#else - #define __CLC_BODY #include #define __CLC_BODY #include -#endif - #endif // __CLC_SHARED_CLC_MAX_H__ diff --git a/libclc/clc/include/clc/shared/clc_min.h b/libclc/clc/include/clc/shared/clc_min.h index c8d920e1b4eb8..a10193885328f 100644 --- a/libclc/clc/include/clc/shared/clc_min.h +++ b/libclc/clc/include/clc/shared/clc_min.h @@ -1,17 +1,10 @@ #ifndef __CLC_SHARED_CLC_MIN_H__ #define __CLC_SHARED_CLC_MIN_H__ 
-#if defined(CLC_CLSPV) || defined(CLC_SPIRV) -// clspv and spir-v targets provide their own OpenCL-compatible min -#define __clc_min min -#else - #define __CLC_BODY #include #define __CLC_BODY #include -#endif - #endif // __CLC_SHARED_CLC_MIN_H__ diff --git a/libclc/clc/lib/clspv/SOURCES b/libclc/clc/lib/clspv/SOURCES deleted file mode 100644 index 2fe07f62a328c..0000000000000 --- a/libclc/clc/lib/clspv/SOURCES +++ /dev/null @@ -1,23 +0,0 @@ -../generic/integer/clc_add_sat.cl -../generic/integer/clc_clz.cl -../generic/integer/clc_hadd.cl -../generic/integer/clc_mad24.cl -../generic/integer/clc_mad_sat.cl -../generic/integer/clc_mul24.cl -../generic/integer/clc_mul_hi.cl -../generic/integer/clc_popcount.cl -../generic/integer/clc_rhadd.cl -../generic/integer/clc_rotate.cl -../generic/integer/clc_sub_sat.cl -../generic/integer/clc_upsample.cl -../generic/math/clc_ceil.cl -../generic/math/clc_copysign.cl -../generic/math/clc_fabs.cl -../generic/math/clc_floor.cl -../generic/math/clc_mad.cl -../generic/math/clc_nextafter.cl -../generic/math/clc_rint.cl -../generic/math/clc_trunc.cl -../generic/relational/clc_isnan.cl -../generic/relational/clc_select.cl -../generic/shared/clc_clamp.cl diff --git a/libclc/clc/lib/spirv/SOURCES b/libclc/clc/lib/spirv/SOURCES deleted file mode 100644 index 96040a3aebd83..0000000000000 --- a/libclc/clc/lib/spirv/SOURCES +++ /dev/null @@ -1,26 +0,0 @@ -../generic/common/clc_degrees.cl -../generic/common/clc_radians.cl -../generic/common/clc_smoothstep.cl -../generic/geometric/clc_dot.cl -../generic/integer/clc_add_sat.cl -../generic/integer/clc_clz.cl -../generic/integer/clc_hadd.cl -../generic/integer/clc_mad24.cl -../generic/integer/clc_mad_sat.cl -../generic/integer/clc_mul24.cl -../generic/integer/clc_mul_hi.cl -../generic/integer/clc_popcount.cl -../generic/integer/clc_rhadd.cl -../generic/integer/clc_rotate.cl -../generic/integer/clc_sub_sat.cl -../generic/integer/clc_upsample.cl -../generic/math/clc_ceil.cl 
-../generic/math/clc_copysign.cl -../generic/math/clc_fabs.cl -../generic/math/clc_floor.cl -../generic/math/clc_mad.cl -../generic/math/clc_nextafter.cl -../generic/math/clc_rint.cl -../generic/math/clc_trunc.cl -../generic/relational/clc_select.cl -../generic/shared/clc_clamp.cl From cab893ab8ebdcf63cfc63666009122d9c0e31bdf Mon Sep 17 00:00:00 2001 From: Aniket Lal Date: Mon, 10 Feb 2025 15:54:13 +0530 Subject: [PATCH 134/293] [Clang][Driver][HIP] Do not specify explicit target cpu in host compilation run line (#126488) This PR fixes the post merge check fails from PR https://github.com/llvm/llvm-project/pull/125646 Co-authored-by: anikelal --- clang/test/Driver/dep-file-flag-with-multiple-offload-archs.hip | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/Driver/dep-file-flag-with-multiple-offload-archs.hip b/clang/test/Driver/dep-file-flag-with-multiple-offload-archs.hip index d26faf7242f91..79a52f0bc8981 100644 --- a/clang/test/Driver/dep-file-flag-with-multiple-offload-archs.hip +++ b/clang/test/Driver/dep-file-flag-with-multiple-offload-archs.hip @@ -8,6 +8,6 @@ // CHECK-NOT: {{.*}}clang{{.*}}"-target-cpu" "gfx1101"{{.*}}"-dependency-file" "tmp.d" // CHECK: {{.*}}lld{{.*}}"-plugin-opt=mcpu=gfx1101" // CHECK: {{.*}}clang-offload-bundler -// CHECK: {{.*}}clang{{.*}}"-target-cpu" "x86-64"{{.*}}"-dependency-file" "tmp.d" +// CHECK: {{.*}}clang{{.*}}"-target-cpu"{{.*}}"-dependency-file" "tmp.d" void main(){} From f845497f3b2e9b8660cfd33177c8e8a2ce1b8fc0 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 10 Feb 2025 10:32:45 +0000 Subject: [PATCH 135/293] [llvm][Docs] Explain how to handle excessive formatting changes (#126239) Based on some feedback in Discord about a PR where a reviewer asked the author to move the formatting changes to a new PR, which appears to contradict the current form of this document. I've added an explanation here, before the point where the author would be committing any of the formatting changes. 
There are other ways this can go, for example some projects don't want the churn of formatting, or you can pre-emptively send a formatting PR, but I don't think enumerating them all here will help the audience for this text. So I've recommended one path that will start them off well, and can branch off if the reviewers make requests. --- llvm/docs/Contributing.rst | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/llvm/docs/Contributing.rst b/llvm/docs/Contributing.rst index 9311f39b6e697..5ee07fcec5bf9 100644 --- a/llvm/docs/Contributing.rst +++ b/llvm/docs/Contributing.rst @@ -73,15 +73,33 @@ recent commit: % git clang-format HEAD~1 -Note that this modifies the files, but doesn't commit them -- you'll likely want -to run +.. note:: + For some patches, formatting them may add changes that obscure the intent of + the patch. For example, adding to an enum that was not previously formatted + may result in the entire enum being reformatted. This happens because not all + of the LLVM Project conforms to LLVM's clang-format style at this time. + + If you think that this might be the case for your changes, or are unsure, we + recommend that you add the formatting changes as a **separate commit** within + the Pull Request. + + Reviewers may request that this formatting commit be made into a separate Pull + Request that will be merged before your actual changes. + + This means that if the formatting changes are the first commit, you will have + an easier time doing this. If they are not, that is ok too, but you will have + to do a bit more work to separate it out. + +Note that ``git clang-format`` modifies the files, but does not commit them -- +you will likely want to run one of the following to add the changes to a commit: .. code-block:: console + # To create a new commit. + % git commit -a + # To add to the most recent commit. % git commit --amend -a -in order to update the last commit with all pending changes. - .. 
note:: If you don't already have ``clang-format`` or ``git clang-format`` installed on your system, the ``clang-format`` binary will be built alongside clang, and From d9183fd96ef2e87b8c59b26956316a97fece0c84 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 10 Feb 2025 11:24:04 +0000 Subject: [PATCH 136/293] [X86] LowerSelect - use BLENDV for scalar selection on all SSE41+ targets (#125853) When we first began (2015) to lower f32/f64 selects to X86ISD::BLENDV(scalar_to_vector(),scalar_to_vector(),scalar_to_vector()), we limited it to AVX targets to avoid issues with SSE41's xmm0 constraint for the condition mask. Since then we've seen general improvements in TwoAddressInstruction and better handling of condition commutation for X86ISD::BLENDV nodes, which should address many of the original concerns of using SSE41 BLENDVPD/S. In most cases we will replace 3 logic instructions with the BLENDV node and (up to 3) additional moves. Although the BLENDV is often more expensive on original SSE41 targets, this should still be an improvement in a majority of cases. We also have no equivalent restrictions for SSE41 for v2f64/v4f32 vector selection. 
Fixes #105807 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 13 +- llvm/test/CodeGen/X86/fmaxnum.ll | 124 +++-- llvm/test/CodeGen/X86/fminnum.ll | 124 +++-- llvm/test/CodeGen/X86/fp-select-cmp-and.ll | 10 +- llvm/test/CodeGen/X86/setcc-combine.ll | 56 ++- llvm/test/CodeGen/X86/sse-minmax.ll | 144 +++--- llvm/test/CodeGen/X86/vec_floor.ll | 28 +- llvm/test/CodeGen/X86/vector-reduce-fmax.ll | 258 +++++----- .../CodeGen/X86/vector-reduce-fmaximum.ll | 467 +++++++++--------- llvm/test/CodeGen/X86/vector-reduce-fmin.ll | 249 +++++----- llvm/test/CodeGen/X86/vselect-zero.ll | 47 +- 11 files changed, 782 insertions(+), 738 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9a916a663a64c..4e7ee83d00488 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -24648,19 +24648,14 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, DAG.getTargetConstant(SSECC, DL, MVT::i8)); - // If we have AVX, we can use a variable vector select (VBLENDV) instead - // of 3 logic instructions for size savings and potentially speed. + // If we have SSE41/AVX, we can use a variable vector select (VBLENDV) + // instead of 3 logic instructions for size savings and potentially speed. // Unfortunately, there is no scalar form of VBLENDV. - + // // If either operand is a +0.0 constant, don't try this. We can expect to // optimize away at least one of the logic instructions later in that // case, so that sequence would be faster than a variable blend. - - // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly - // uses XMM0 as the selection register. That may need just as many - // instructions as the AND/ANDN/OR sequence due to register moves, so - // don't bother. 
- if (Subtarget.hasAVX() && !isNullFPConstant(Op1) && + if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) && !isNullFPConstant(Op2)) { // Convert to vectors, do a VSELECT, and convert back to scalar. // All of the conversions should be optimized away. diff --git a/llvm/test/CodeGen/X86/fmaxnum.ll b/llvm/test/CodeGen/X86/fmaxnum.ll index 2e1af1e84e076..d6252cc85e8b4 100644 --- a/llvm/test/CodeGen/X86/fmaxnum.ll +++ b/llvm/test/CodeGen/X86/fmaxnum.ll @@ -22,17 +22,26 @@ declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>) ; FIXME: As the vector tests show, the SSE run shouldn't need this many moves. define float @test_fmaxf(float %x, float %y) { -; SSE-LABEL: test_fmaxf: -; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: cmpunordss %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: andps %xmm1, %xmm3 -; SSE-NEXT: maxss %xmm0, %xmm1 -; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: orps %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_fmaxf: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: cmpunordss %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: orps %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: test_fmaxf: +; SSE4: # %bb.0: +; SSE4-NEXT: movaps %xmm1, %xmm2 +; SSE4-NEXT: maxss %xmm0, %xmm2 +; SSE4-NEXT: cmpunordss %xmm0, %xmm0 +; SSE4-NEXT: blendvps %xmm0, %xmm1, %xmm2 +; SSE4-NEXT: movaps %xmm2, %xmm0 +; SSE4-NEXT: retq ; ; AVX1-LABEL: test_fmaxf: ; AVX1: # %bb.0: @@ -63,17 +72,26 @@ define float @test_fmaxf_minsize(float %x, float %y) minsize { ; FIXME: As the vector tests show, the SSE run shouldn't need this many moves. 
define double @test_fmax(double %x, double %y) { -; SSE-LABEL: test_fmax: -; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm0, %xmm2 -; SSE-NEXT: cmpunordsd %xmm0, %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm3 -; SSE-NEXT: andpd %xmm1, %xmm3 -; SSE-NEXT: maxsd %xmm0, %xmm1 -; SSE-NEXT: andnpd %xmm1, %xmm2 -; SSE-NEXT: orpd %xmm3, %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_fmax: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm3 +; SSE2-NEXT: andpd %xmm1, %xmm3 +; SSE2-NEXT: maxsd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm1, %xmm2 +; SSE2-NEXT: orpd %xmm3, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: test_fmax: +; SSE4: # %bb.0: +; SSE4-NEXT: movapd %xmm1, %xmm2 +; SSE4-NEXT: maxsd %xmm0, %xmm2 +; SSE4-NEXT: cmpunordsd %xmm0, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE4-NEXT: movapd %xmm2, %xmm0 +; SSE4-NEXT: retq ; ; AVX1-LABEL: test_fmax: ; AVX1: # %bb.0: @@ -111,17 +129,26 @@ define x86_fp80 @test_fmaxl(x86_fp80 %x, x86_fp80 %y) { } define float @test_intrinsic_fmaxf(float %x, float %y) { -; SSE-LABEL: test_intrinsic_fmaxf: -; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: cmpunordss %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: andps %xmm1, %xmm3 -; SSE-NEXT: maxss %xmm0, %xmm1 -; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: orps %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_intrinsic_fmaxf: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: cmpunordss %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm3 +; SSE2-NEXT: maxss %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: orps %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: test_intrinsic_fmaxf: +; SSE4: # %bb.0: +; SSE4-NEXT: movaps %xmm1, %xmm2 +; SSE4-NEXT: maxss %xmm0, %xmm2 +; SSE4-NEXT: cmpunordss %xmm0, %xmm0 +; SSE4-NEXT: 
blendvps %xmm0, %xmm1, %xmm2 +; SSE4-NEXT: movaps %xmm2, %xmm0 +; SSE4-NEXT: retq ; ; AVX1-LABEL: test_intrinsic_fmaxf: ; AVX1: # %bb.0: @@ -142,17 +169,26 @@ define float @test_intrinsic_fmaxf(float %x, float %y) { } define double @test_intrinsic_fmax(double %x, double %y) { -; SSE-LABEL: test_intrinsic_fmax: -; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm0, %xmm2 -; SSE-NEXT: cmpunordsd %xmm0, %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm3 -; SSE-NEXT: andpd %xmm1, %xmm3 -; SSE-NEXT: maxsd %xmm0, %xmm1 -; SSE-NEXT: andnpd %xmm1, %xmm2 -; SSE-NEXT: orpd %xmm3, %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_intrinsic_fmax: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm3 +; SSE2-NEXT: andpd %xmm1, %xmm3 +; SSE2-NEXT: maxsd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm1, %xmm2 +; SSE2-NEXT: orpd %xmm3, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: test_intrinsic_fmax: +; SSE4: # %bb.0: +; SSE4-NEXT: movapd %xmm1, %xmm2 +; SSE4-NEXT: maxsd %xmm0, %xmm2 +; SSE4-NEXT: cmpunordsd %xmm0, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE4-NEXT: movapd %xmm2, %xmm0 +; SSE4-NEXT: retq ; ; AVX1-LABEL: test_intrinsic_fmax: ; AVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/fminnum.ll b/llvm/test/CodeGen/X86/fminnum.ll index 1290a7b819106..0ef8fdec33d93 100644 --- a/llvm/test/CodeGen/X86/fminnum.ll +++ b/llvm/test/CodeGen/X86/fminnum.ll @@ -22,17 +22,26 @@ declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) ; FIXME: As the vector tests show, the SSE run shouldn't need this many moves. 
define float @test_fminf(float %x, float %y) { -; SSE-LABEL: test_fminf: -; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: cmpunordss %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: andps %xmm1, %xmm3 -; SSE-NEXT: minss %xmm0, %xmm1 -; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: orps %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_fminf: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: cmpunordss %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm3 +; SSE2-NEXT: minss %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: orps %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: test_fminf: +; SSE4: # %bb.0: +; SSE4-NEXT: movaps %xmm1, %xmm2 +; SSE4-NEXT: minss %xmm0, %xmm2 +; SSE4-NEXT: cmpunordss %xmm0, %xmm0 +; SSE4-NEXT: blendvps %xmm0, %xmm1, %xmm2 +; SSE4-NEXT: movaps %xmm2, %xmm0 +; SSE4-NEXT: retq ; ; AVX1-LABEL: test_fminf: ; AVX1: # %bb.0: @@ -63,17 +72,26 @@ define float @test_fminf_minsize(float %x, float %y) minsize { ; FIXME: As the vector tests show, the SSE run shouldn't need this many moves. 
define double @test_fmin(double %x, double %y) { -; SSE-LABEL: test_fmin: -; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm0, %xmm2 -; SSE-NEXT: cmpunordsd %xmm0, %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm3 -; SSE-NEXT: andpd %xmm1, %xmm3 -; SSE-NEXT: minsd %xmm0, %xmm1 -; SSE-NEXT: andnpd %xmm1, %xmm2 -; SSE-NEXT: orpd %xmm3, %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_fmin: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm3 +; SSE2-NEXT: andpd %xmm1, %xmm3 +; SSE2-NEXT: minsd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm1, %xmm2 +; SSE2-NEXT: orpd %xmm3, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: test_fmin: +; SSE4: # %bb.0: +; SSE4-NEXT: movapd %xmm1, %xmm2 +; SSE4-NEXT: minsd %xmm0, %xmm2 +; SSE4-NEXT: cmpunordsd %xmm0, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE4-NEXT: movapd %xmm2, %xmm0 +; SSE4-NEXT: retq ; ; AVX1-LABEL: test_fmin: ; AVX1: # %bb.0: @@ -111,17 +129,26 @@ define x86_fp80 @test_fminl(x86_fp80 %x, x86_fp80 %y) { } define float @test_intrinsic_fminf(float %x, float %y) { -; SSE-LABEL: test_intrinsic_fminf: -; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: cmpunordss %xmm0, %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm3 -; SSE-NEXT: andps %xmm1, %xmm3 -; SSE-NEXT: minss %xmm0, %xmm1 -; SSE-NEXT: andnps %xmm1, %xmm2 -; SSE-NEXT: orps %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_intrinsic_fminf: +; SSE2: # %bb.0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: cmpunordss %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm3 +; SSE2-NEXT: andps %xmm1, %xmm3 +; SSE2-NEXT: minss %xmm0, %xmm1 +; SSE2-NEXT: andnps %xmm1, %xmm2 +; SSE2-NEXT: orps %xmm3, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: test_intrinsic_fminf: +; SSE4: # %bb.0: +; SSE4-NEXT: movaps %xmm1, %xmm2 +; SSE4-NEXT: minss %xmm0, %xmm2 +; SSE4-NEXT: cmpunordss %xmm0, %xmm0 +; SSE4-NEXT: 
blendvps %xmm0, %xmm1, %xmm2 +; SSE4-NEXT: movaps %xmm2, %xmm0 +; SSE4-NEXT: retq ; ; AVX1-LABEL: test_intrinsic_fminf: ; AVX1: # %bb.0: @@ -142,17 +169,26 @@ define float @test_intrinsic_fminf(float %x, float %y) { } define double @test_intrinsic_fmin(double %x, double %y) { -; SSE-LABEL: test_intrinsic_fmin: -; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm0, %xmm2 -; SSE-NEXT: cmpunordsd %xmm0, %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm3 -; SSE-NEXT: andpd %xmm1, %xmm3 -; SSE-NEXT: minsd %xmm0, %xmm1 -; SSE-NEXT: andnpd %xmm1, %xmm2 -; SSE-NEXT: orpd %xmm3, %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_intrinsic_fmin: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm0, %xmm2 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm3 +; SSE2-NEXT: andpd %xmm1, %xmm3 +; SSE2-NEXT: minsd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm1, %xmm2 +; SSE2-NEXT: orpd %xmm3, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: test_intrinsic_fmin: +; SSE4: # %bb.0: +; SSE4-NEXT: movapd %xmm1, %xmm2 +; SSE4-NEXT: minsd %xmm0, %xmm2 +; SSE4-NEXT: cmpunordsd %xmm0, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE4-NEXT: movapd %xmm2, %xmm0 +; SSE4-NEXT: retq ; ; AVX1-LABEL: test_intrinsic_fmin: ; AVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/fp-select-cmp-and.ll b/llvm/test/CodeGen/X86/fp-select-cmp-and.ll index 0f6159d36ea81..1d006f725ca34 100644 --- a/llvm/test/CodeGen/X86/fp-select-cmp-and.ll +++ b/llvm/test/CodeGen/X86/fp-select-cmp-and.ll @@ -189,10 +189,9 @@ define float @test17(float %a, float %b, float %c, float %eps) { ; CHECK-LABEL: test17: ; CHECK: # %bb.0: ; CHECK-NEXT: cmpless %xmm0, %xmm3 -; CHECK-NEXT: andps %xmm3, %xmm2 -; CHECK-NEXT: andnps %xmm1, %xmm3 -; CHECK-NEXT: orps %xmm2, %xmm3 ; CHECK-NEXT: movaps %xmm3, %xmm0 +; CHECK-NEXT: blendvps %xmm0, %xmm2, %xmm1 +; CHECK-NEXT: movaps %xmm1, %xmm0 ; CHECK-NEXT: retq %cmp = fcmp oge float %a, %eps %cond = select i1 %cmp, float %c, float %b @@ -203,10 +202,9 @@ 
define double @test18(double %a, double %b, double %c, double %eps) { ; CHECK-LABEL: test18: ; CHECK: # %bb.0: ; CHECK-NEXT: cmplesd %xmm0, %xmm3 -; CHECK-NEXT: andpd %xmm3, %xmm2 -; CHECK-NEXT: andnpd %xmm1, %xmm3 -; CHECK-NEXT: orpd %xmm2, %xmm3 ; CHECK-NEXT: movapd %xmm3, %xmm0 +; CHECK-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; CHECK-NEXT: movapd %xmm1, %xmm0 ; CHECK-NEXT: retq %cmp = fcmp oge double %a, %eps %cond = select i1 %cmp, double %c, double %b diff --git a/llvm/test/CodeGen/X86/setcc-combine.ll b/llvm/test/CodeGen/X86/setcc-combine.ll index e723569bda8a1..f526db00df606 100644 --- a/llvm/test/CodeGen/X86/setcc-combine.ll +++ b/llvm/test/CodeGen/X86/setcc-combine.ll @@ -463,14 +463,23 @@ define <2 x double> @oge(<2 x double> %x) { ; negative test - don't create an fneg to replace 0.0 operand define double @ogt_no_fneg(double %x, double %y) { -; CHECK-LABEL: ogt_no_fneg: -; CHECK: # %bb.0: -; CHECK-NEXT: xorpd %xmm2, %xmm2 -; CHECK-NEXT: cmpltsd %xmm0, %xmm2 -; CHECK-NEXT: andpd %xmm2, %xmm0 -; CHECK-NEXT: andnpd %xmm1, %xmm2 -; CHECK-NEXT: orpd %xmm2, %xmm0 -; CHECK-NEXT: retq +; SSE2-LABEL: ogt_no_fneg: +; SSE2: # %bb.0: +; SSE2-NEXT: xorpd %xmm2, %xmm2 +; SSE2-NEXT: cmpltsd %xmm0, %xmm2 +; SSE2-NEXT: andpd %xmm2, %xmm0 +; SSE2-NEXT: andnpd %xmm1, %xmm2 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: ogt_no_fneg: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm0, %xmm2 +; SSE41-NEXT: xorpd %xmm0, %xmm0 +; SSE41-NEXT: cmpltsd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: retq %cmp = fcmp ogt double %x, 0.0 %r = select i1 %cmp, double %x, double %y ret double %r @@ -479,16 +488,27 @@ define double @ogt_no_fneg(double %x, double %y) { ; negative test - can't change the setcc for non-zero constant define double @ogt_no_zero(double %x) { -; CHECK-LABEL: ogt_no_zero: -; CHECK: # %bb.0: -; CHECK-NEXT: movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] -; CHECK-NEXT: xorpd %xmm0, %xmm1 -; 
CHECK-NEXT: movsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0] -; CHECK-NEXT: cmpltsd %xmm0, %xmm2 -; CHECK-NEXT: andpd %xmm2, %xmm0 -; CHECK-NEXT: andnpd %xmm1, %xmm2 -; CHECK-NEXT: orpd %xmm2, %xmm0 -; CHECK-NEXT: retq +; SSE2-LABEL: ogt_no_zero: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] +; SSE2-NEXT: xorpd %xmm0, %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0] +; SSE2-NEXT: cmpltsd %xmm0, %xmm2 +; SSE2-NEXT: andpd %xmm2, %xmm0 +; SSE2-NEXT: andnpd %xmm1, %xmm2 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: ogt_no_zero: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm0, %xmm1 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; SSE41-NEXT: xorpd %xmm0, %xmm2 +; SSE41-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] +; SSE41-NEXT: cmpltsd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: retq %neg = fneg double %x %cmp = fcmp ogt double %x, 1.0 %r = select i1 %cmp, double %x, double %neg diff --git a/llvm/test/CodeGen/X86/sse-minmax.ll b/llvm/test/CodeGen/X86/sse-minmax.ll index 1c14b7400a358..7904b21a3b1fa 100644 --- a/llvm/test/CodeGen/X86/sse-minmax.ll +++ b/llvm/test/CodeGen/X86/sse-minmax.ll @@ -80,11 +80,11 @@ define double @olt_inverse(double %x, double %y) { define double @oge(double %x, double %y) { ; STRICT-LABEL: oge: ; STRICT: # %bb.0: -; STRICT-NEXT: movapd %xmm1, %xmm2 -; STRICT-NEXT: cmplesd %xmm0, %xmm2 -; STRICT-NEXT: andpd %xmm2, %xmm0 -; STRICT-NEXT: andnpd %xmm1, %xmm2 -; STRICT-NEXT: orpd %xmm2, %xmm0 +; STRICT-NEXT: movapd %xmm0, %xmm2 +; STRICT-NEXT: movapd %xmm1, %xmm0 +; STRICT-NEXT: cmplesd %xmm2, %xmm0 +; STRICT-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; STRICT-NEXT: movapd %xmm1, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: oge: @@ -100,10 +100,9 @@ define double @ole(double %x, double %y) { ; STRICT-LABEL: ole: ; STRICT: # %bb.0: ; STRICT-NEXT: movapd %xmm0, %xmm2 -; STRICT-NEXT: cmplesd %xmm1, %xmm2 -; STRICT-NEXT: andpd %xmm2, %xmm0 -; 
STRICT-NEXT: andnpd %xmm1, %xmm2 -; STRICT-NEXT: orpd %xmm2, %xmm0 +; STRICT-NEXT: cmplesd %xmm1, %xmm0 +; STRICT-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; STRICT-NEXT: movapd %xmm1, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ole: @@ -118,11 +117,10 @@ define double @ole(double %x, double %y) { define double @oge_inverse(double %x, double %y) { ; STRICT-LABEL: oge_inverse: ; STRICT: # %bb.0: -; STRICT-NEXT: movapd %xmm1, %xmm2 -; STRICT-NEXT: cmplesd %xmm0, %xmm2 -; STRICT-NEXT: andpd %xmm2, %xmm1 -; STRICT-NEXT: andnpd %xmm0, %xmm2 -; STRICT-NEXT: orpd %xmm1, %xmm2 +; STRICT-NEXT: movapd %xmm0, %xmm2 +; STRICT-NEXT: movapd %xmm1, %xmm0 +; STRICT-NEXT: cmplesd %xmm2, %xmm0 +; STRICT-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; STRICT-NEXT: movapd %xmm2, %xmm0 ; STRICT-NEXT: retq ; @@ -145,10 +143,8 @@ define double @ole_inverse(double %x, double %y) { ; STRICT-LABEL: ole_inverse: ; STRICT: # %bb.0: ; STRICT-NEXT: movapd %xmm0, %xmm2 -; STRICT-NEXT: cmplesd %xmm1, %xmm2 -; STRICT-NEXT: andpd %xmm2, %xmm1 -; STRICT-NEXT: andnpd %xmm0, %xmm2 -; STRICT-NEXT: orpd %xmm1, %xmm2 +; STRICT-NEXT: cmplesd %xmm1, %xmm0 +; STRICT-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; STRICT-NEXT: movapd %xmm2, %xmm0 ; STRICT-NEXT: retq ; @@ -333,10 +329,9 @@ define double @ugt(double %x, double %y) { ; STRICT-LABEL: ugt: ; STRICT: # %bb.0: ; STRICT-NEXT: movapd %xmm0, %xmm2 -; STRICT-NEXT: cmpnlesd %xmm1, %xmm2 -; STRICT-NEXT: andpd %xmm2, %xmm0 -; STRICT-NEXT: andnpd %xmm1, %xmm2 -; STRICT-NEXT: orpd %xmm2, %xmm0 +; STRICT-NEXT: cmpnlesd %xmm1, %xmm0 +; STRICT-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; STRICT-NEXT: movapd %xmm1, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ugt: @@ -351,11 +346,11 @@ define double @ugt(double %x, double %y) { define double @ult(double %x, double %y) { ; STRICT-LABEL: ult: ; STRICT: # %bb.0: -; STRICT-NEXT: movapd %xmm1, %xmm2 -; STRICT-NEXT: cmpnlesd %xmm0, %xmm2 -; STRICT-NEXT: andpd %xmm2, %xmm0 -; STRICT-NEXT: andnpd %xmm1, %xmm2 -; STRICT-NEXT: orpd %xmm2, %xmm0 +; 
STRICT-NEXT: movapd %xmm0, %xmm2 +; STRICT-NEXT: movapd %xmm1, %xmm0 +; STRICT-NEXT: cmpnlesd %xmm2, %xmm0 +; STRICT-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; STRICT-NEXT: movapd %xmm1, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ult: @@ -371,10 +366,8 @@ define double @ugt_inverse(double %x, double %y) { ; STRICT-LABEL: ugt_inverse: ; STRICT: # %bb.0: ; STRICT-NEXT: movapd %xmm0, %xmm2 -; STRICT-NEXT: cmpnlesd %xmm1, %xmm2 -; STRICT-NEXT: andpd %xmm2, %xmm1 -; STRICT-NEXT: andnpd %xmm0, %xmm2 -; STRICT-NEXT: orpd %xmm1, %xmm2 +; STRICT-NEXT: cmpnlesd %xmm1, %xmm0 +; STRICT-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; STRICT-NEXT: movapd %xmm2, %xmm0 ; STRICT-NEXT: retq ; @@ -396,11 +389,10 @@ define double @ugt_inverse(double %x, double %y) { define double @ult_inverse(double %x, double %y) { ; STRICT-LABEL: ult_inverse: ; STRICT: # %bb.0: -; STRICT-NEXT: movapd %xmm1, %xmm2 -; STRICT-NEXT: cmpnlesd %xmm0, %xmm2 -; STRICT-NEXT: andpd %xmm2, %xmm1 -; STRICT-NEXT: andnpd %xmm0, %xmm2 -; STRICT-NEXT: orpd %xmm1, %xmm2 +; STRICT-NEXT: movapd %xmm0, %xmm2 +; STRICT-NEXT: movapd %xmm1, %xmm0 +; STRICT-NEXT: cmpnlesd %xmm2, %xmm0 +; STRICT-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; STRICT-NEXT: movapd %xmm2, %xmm0 ; STRICT-NEXT: retq ; @@ -738,12 +730,12 @@ define double @olt_inverse_y(double %x) { define double @oge_y(double %x) { ; STRICT-LABEL: oge_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] -; STRICT-NEXT: movapd %xmm1, %xmm2 -; STRICT-NEXT: cmplesd %xmm0, %xmm2 -; STRICT-NEXT: andpd %xmm2, %xmm0 -; STRICT-NEXT: andnpd %xmm1, %xmm2 -; STRICT-NEXT: orpd %xmm2, %xmm0 +; STRICT-NEXT: movapd %xmm0, %xmm1 +; STRICT-NEXT: movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0] +; STRICT-NEXT: cmplesd %xmm1, %xmm0 +; STRICT-NEXT: movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; STRICT-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; STRICT-NEXT: movapd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: oge_y: @@ -758,12 +750,11 @@ define double @oge_y(double %x) { define double @ole_y(double 
%x) { ; STRICT-LABEL: ole_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] -; STRICT-NEXT: movapd %xmm0, %xmm2 -; STRICT-NEXT: cmplesd %xmm1, %xmm2 -; STRICT-NEXT: andpd %xmm2, %xmm0 -; STRICT-NEXT: andnpd %xmm1, %xmm2 -; STRICT-NEXT: orpd %xmm2, %xmm0 +; STRICT-NEXT: movapd %xmm0, %xmm1 +; STRICT-NEXT: cmplesd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; STRICT-NEXT: movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; STRICT-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; STRICT-NEXT: movapd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ole_y: @@ -778,12 +769,10 @@ define double @ole_y(double %x) { define double @oge_inverse_y(double %x) { ; STRICT-LABEL: oge_inverse_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm2 = [-0.0E+0,0.0E+0] -; STRICT-NEXT: movapd %xmm2, %xmm1 -; STRICT-NEXT: cmplesd %xmm0, %xmm1 -; STRICT-NEXT: andpd %xmm1, %xmm2 -; STRICT-NEXT: andnpd %xmm0, %xmm1 -; STRICT-NEXT: orpd %xmm2, %xmm1 +; STRICT-NEXT: movapd %xmm0, %xmm1 +; STRICT-NEXT: movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0] +; STRICT-NEXT: cmplesd %xmm1, %xmm0 +; STRICT-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; STRICT-NEXT: movapd %xmm1, %xmm0 ; STRICT-NEXT: retq ; @@ -806,12 +795,9 @@ define double @oge_inverse_y(double %x) { define double @ole_inverse_y(double %x) { ; STRICT-LABEL: ole_inverse_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm2 = [-0.0E+0,0.0E+0] ; STRICT-NEXT: movapd %xmm0, %xmm1 -; STRICT-NEXT: cmplesd %xmm2, %xmm1 -; STRICT-NEXT: andpd %xmm1, %xmm2 -; STRICT-NEXT: andnpd %xmm0, %xmm1 -; STRICT-NEXT: orpd %xmm2, %xmm1 +; STRICT-NEXT: cmplesd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; STRICT-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; STRICT-NEXT: movapd %xmm1, %xmm0 ; STRICT-NEXT: retq ; @@ -834,12 +820,11 @@ define double @ole_inverse_y(double %x) { define double @ugt_y(double %x) { ; STRICT-LABEL: ugt_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] -; STRICT-NEXT: movapd %xmm0, 
%xmm2 -; STRICT-NEXT: cmpnlesd %xmm1, %xmm2 -; STRICT-NEXT: andpd %xmm2, %xmm0 -; STRICT-NEXT: andnpd %xmm1, %xmm2 -; STRICT-NEXT: orpd %xmm2, %xmm0 +; STRICT-NEXT: movapd %xmm0, %xmm1 +; STRICT-NEXT: cmpnlesd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; STRICT-NEXT: movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; STRICT-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; STRICT-NEXT: movapd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ugt_y: @@ -854,12 +839,12 @@ define double @ugt_y(double %x) { define double @ult_y(double %x) { ; STRICT-LABEL: ult_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] -; STRICT-NEXT: movapd %xmm1, %xmm2 -; STRICT-NEXT: cmpnlesd %xmm0, %xmm2 -; STRICT-NEXT: andpd %xmm2, %xmm0 -; STRICT-NEXT: andnpd %xmm1, %xmm2 -; STRICT-NEXT: orpd %xmm2, %xmm0 +; STRICT-NEXT: movapd %xmm0, %xmm1 +; STRICT-NEXT: movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0] +; STRICT-NEXT: cmpnlesd %xmm1, %xmm0 +; STRICT-NEXT: movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; STRICT-NEXT: blendvpd %xmm0, %xmm1, %xmm2 +; STRICT-NEXT: movapd %xmm2, %xmm0 ; STRICT-NEXT: retq ; ; RELAX-LABEL: ult_y: @@ -874,12 +859,9 @@ define double @ult_y(double %x) { define double @ugt_inverse_y(double %x) { ; STRICT-LABEL: ugt_inverse_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm2 = [-0.0E+0,0.0E+0] ; STRICT-NEXT: movapd %xmm0, %xmm1 -; STRICT-NEXT: cmpnlesd %xmm2, %xmm1 -; STRICT-NEXT: andpd %xmm1, %xmm2 -; STRICT-NEXT: andnpd %xmm0, %xmm1 -; STRICT-NEXT: orpd %xmm2, %xmm1 +; STRICT-NEXT: cmpnlesd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; STRICT-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; STRICT-NEXT: movapd %xmm1, %xmm0 ; STRICT-NEXT: retq ; @@ -902,12 +884,10 @@ define double @ugt_inverse_y(double %x) { define double @ult_inverse_y(double %x) { ; STRICT-LABEL: ult_inverse_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm2 = [-0.0E+0,0.0E+0] -; STRICT-NEXT: movapd %xmm2, %xmm1 -; STRICT-NEXT: cmpnlesd %xmm0, %xmm1 -; STRICT-NEXT: andpd %xmm1, %xmm2 -; 
STRICT-NEXT: andnpd %xmm0, %xmm1 -; STRICT-NEXT: orpd %xmm2, %xmm1 +; STRICT-NEXT: movapd %xmm0, %xmm1 +; STRICT-NEXT: movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0] +; STRICT-NEXT: cmpnlesd %xmm1, %xmm0 +; STRICT-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; STRICT-NEXT: movapd %xmm1, %xmm0 ; STRICT-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll index 65cde6ac91106..abb85ac83464c 100644 --- a/llvm/test/CodeGen/X86/vec_floor.ll +++ b/llvm/test/CodeGen/X86/vec_floor.ll @@ -1679,10 +1679,9 @@ define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x flo ; SSE41: ## %bb.0: ; SSE41-NEXT: roundss $9, %xmm0, %xmm3 ; SSE41-NEXT: cmpeqss %xmm1, %xmm0 -; SSE41-NEXT: andps %xmm0, %xmm3 -; SSE41-NEXT: andnps %xmm2, %xmm0 -; SSE41-NEXT: orps %xmm3, %xmm0 -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3] +; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: floor_mask_ss_mask8: @@ -1747,10 +1746,9 @@ define <2 x double> @floor_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x ; SSE41: ## %bb.0: ; SSE41-NEXT: roundsd $9, %xmm0, %xmm3 ; SSE41-NEXT: cmpeqsd %xmm1, %xmm0 -; SSE41-NEXT: andpd %xmm0, %xmm3 -; SSE41-NEXT: andnpd %xmm2, %xmm0 -; SSE41-NEXT: orpd %xmm3, %xmm0 -; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm1[1] +; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: floor_mask_sd_mask8: @@ -2671,10 +2669,9 @@ define <4 x float> @ceil_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x floa ; SSE41: ## %bb.0: ; SSE41-NEXT: roundss $10, %xmm0, %xmm3 ; SSE41-NEXT: cmpeqss %xmm1, %xmm0 -; SSE41-NEXT: andps %xmm0, %xmm3 -; SSE41-NEXT: andnps %xmm2, %xmm0 -; SSE41-NEXT: orps %xmm3, %xmm0 -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT: 
blendvps %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3] +; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: ceil_mask_ss_mask8: @@ -2739,10 +2736,9 @@ define <2 x double> @ceil_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x d ; SSE41: ## %bb.0: ; SSE41-NEXT: roundsd $10, %xmm0, %xmm3 ; SSE41-NEXT: cmpeqsd %xmm1, %xmm0 -; SSE41-NEXT: andpd %xmm0, %xmm3 -; SSE41-NEXT: andnpd %xmm2, %xmm0 -; SSE41-NEXT: orpd %xmm3, %xmm0 -; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm1[1] +; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: ceil_mask_sd_mask8: diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll index fe2c41f57cfab..7048b98227620 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW @@ -36,13 +36,10 @@ define float @test_v2f32(<2 x float> %a0) { ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE41-NEXT: movaps 
%xmm0, %xmm1 -; SSE41-NEXT: cmpunordss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm3 -; SSE41-NEXT: andps %xmm2, %xmm3 -; SSE41-NEXT: maxss %xmm0, %xmm2 -; SSE41-NEXT: andnps %xmm2, %xmm1 -; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: maxss %xmm0, %xmm1 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -89,21 +86,19 @@ define float @test_v3f32(<3 x float> %a0) { ; ; SSE41-LABEL: test_v3f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT: movaps %xmm0, %xmm2 -; SSE41-NEXT: cmpunordss %xmm0, %xmm2 -; SSE41-NEXT: movaps %xmm2, %xmm3 -; SSE41-NEXT: andps %xmm1, %xmm3 -; SSE41-NEXT: maxss %xmm0, %xmm1 -; SSE41-NEXT: andnps %xmm1, %xmm2 -; SSE41-NEXT: orps %xmm3, %xmm2 -; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: movaps %xmm3, %xmm2 +; SSE41-NEXT: maxss %xmm0, %xmm2 +; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: maxss %xmm2, %xmm3 ; SSE41-NEXT: cmpunordss %xmm2, %xmm2 -; SSE41-NEXT: andps %xmm2, %xmm0 -; SSE41-NEXT: andnps %xmm1, %xmm2 -; SSE41-NEXT: orps %xmm2, %xmm0 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movaps %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v3f32: @@ -166,31 +161,26 @@ define float @test_v4f32(<4 x float> %a0) { ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm0, %xmm2 -; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: cmpunordss %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: andps %xmm3, %xmm4 -; SSE41-NEXT: maxss %xmm1, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[3,3,3,3] -; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE41-NEXT: andnps %xmm3, %xmm0 -; SSE41-NEXT: orps %xmm4, %xmm0 -; SSE41-NEXT: movaps %xmm2, %xmm3 -; SSE41-NEXT: maxss %xmm0, %xmm3 -; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] ; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: andnps %xmm3, %xmm4 -; SSE41-NEXT: andps %xmm2, %xmm0 -; SSE41-NEXT: orps %xmm4, %xmm0 -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: maxss %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: maxss %xmm0, %xmm1 ; SSE41-NEXT: cmpunordss %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: andnps %xmm2, %xmm3 -; SSE41-NEXT: andps %xmm1, %xmm0 -; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm4, %xmm2 +; SSE41-NEXT: maxss %xmm1, %xmm2 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: movaps %xmm3, %xmm1 +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: cmpunordss %xmm2, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: @@ -266,35 +256,30 @@ define float @test_v8f32(<8 x float> %a0) { ; ; SSE41-LABEL: test_v8f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: maxps %xmm0, %xmm2 -; SSE41-NEXT: cmpunordps %xmm0, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE41-NEXT: movaps %xmm2, %xmm0 -; SSE41-NEXT: cmpunordss %xmm2, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: andps %xmm1, %xmm3 -; SSE41-NEXT: maxss %xmm2, %xmm1 -; SSE41-NEXT: andnps %xmm1, %xmm0 -; SSE41-NEXT: orps %xmm3, %xmm0 -; SSE41-NEXT: movaps %xmm2, %xmm1 -; 
SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE41-NEXT: movaps %xmm1, %xmm3 -; SSE41-NEXT: maxss %xmm0, %xmm3 -; SSE41-NEXT: cmpunordss %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: andnps %xmm3, %xmm4 -; SSE41-NEXT: andps %xmm1, %xmm0 -; SSE41-NEXT: orps %xmm4, %xmm0 -; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE41-NEXT: maxps %xmm0, %xmm3 +; SSE41-NEXT: cmpunordps %xmm0, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm3[1,1,3,3] ; SSE41-NEXT: movaps %xmm2, %xmm1 -; SSE41-NEXT: maxss %xmm0, %xmm1 -; SSE41-NEXT: cmpunordss %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: andnps %xmm1, %xmm3 -; SSE41-NEXT: andps %xmm2, %xmm0 -; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: maxss %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: cmpunordss %xmm3, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm3, %xmm4 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE41-NEXT: movaps %xmm4, %xmm2 +; SSE41-NEXT: maxss %xmm1, %xmm2 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] +; SSE41-NEXT: movaps %xmm3, %xmm1 +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: cmpunordss %xmm2, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: @@ -458,36 +443,31 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE41-NEXT: cmpunordps %xmm1, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movaps %xmm2, %xmm1 -; SSE41-NEXT: maxps %xmm4, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: maxps %xmm4, %xmm3 ; SSE41-NEXT: cmpunordps %xmm4, %xmm4 ; SSE41-NEXT: movaps %xmm4, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = 
xmm3[1,1,3,3] +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: maxss %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: cmpunordss %xmm3, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: cmpunordss %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: movaps %xmm3, %xmm4 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE41-NEXT: movaps %xmm4, %xmm2 ; SSE41-NEXT: maxss %xmm1, %xmm2 -; SSE41-NEXT: andnps %xmm2, %xmm0 -; SSE41-NEXT: orps %xmm3, %xmm0 -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE41-NEXT: movaps %xmm2, %xmm3 -; SSE41-NEXT: maxss %xmm0, %xmm3 -; SSE41-NEXT: cmpunordss %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: andnps %xmm3, %xmm4 -; SSE41-NEXT: andps %xmm2, %xmm0 -; SSE41-NEXT: orps %xmm4, %xmm0 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: maxss %xmm0, %xmm2 -; SSE41-NEXT: cmpunordss %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: andnps %xmm2, %xmm3 -; SSE41-NEXT: andps %xmm1, %xmm0 -; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] +; SSE41-NEXT: movaps %xmm3, %xmm1 +; SSE41-NEXT: maxss %xmm2, %xmm1 +; SSE41-NEXT: cmpunordss %xmm2, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: @@ -664,19 +644,30 @@ define float @test_v16f32(<16 x float> %a0) { ; define double @test_v2f64(<2 x double> %a0) { -; SSE-LABEL: test_v2f64: -; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: cmpunordsd 
%xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm3 -; SSE-NEXT: andpd %xmm2, %xmm3 -; SSE-NEXT: maxsd %xmm0, %xmm2 -; SSE-NEXT: andnpd %xmm2, %xmm1 -; SSE-NEXT: orpd %xmm3, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: maxsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: maxsd %xmm0, %xmm1 +; SSE41-NEXT: cmpunordsd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: @@ -724,15 +715,14 @@ define double @test_v4f64(<4 x double> %a0) { ; SSE41-NEXT: maxpd %xmm0, %xmm2 ; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE41-NEXT: movapd %xmm2, %xmm0 -; SSE41-NEXT: cmpunordsd %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm0, %xmm3 -; SSE41-NEXT: andpd %xmm1, %xmm3 +; SSE41-NEXT: movapd %xmm2, %xmm3 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE41-NEXT: movapd %xmm3, %xmm1 ; SSE41-NEXT: maxsd %xmm2, %xmm1 -; SSE41-NEXT: andnpd %xmm1, %xmm0 -; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm2, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f64: @@ -820,15 +810,14 @@ define double @test_v8f64(<8 x double> %a0) { ; SSE41-NEXT: cmpunordpd %xmm4, %xmm4 ; 
SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: cmpunordsd %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm0, %xmm3 -; SSE41-NEXT: andpd %xmm2, %xmm3 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE41-NEXT: movapd %xmm3, %xmm2 ; SSE41-NEXT: maxsd %xmm1, %xmm2 -; SSE41-NEXT: andnpd %xmm2, %xmm0 -; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f64: @@ -1012,15 +1001,14 @@ define double @test_v16f64(<16 x double> %a0) { ; SSE41-NEXT: cmpunordpd %xmm2, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: cmpunordsd %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm0, %xmm3 -; SSE41-NEXT: andpd %xmm2, %xmm3 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE41-NEXT: movapd %xmm3, %xmm2 ; SSE41-NEXT: maxsd %xmm1, %xmm2 -; SSE41-NEXT: andnpd %xmm2, %xmm0 -; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f64: diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll index ec41657d2f248..008e3e4c217cb 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll @@ -49,22 +49,19 @@ define float @test_v2f32(<2 x float> %a0) { ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: movd %xmm0, %eax ; 
SSE41-NEXT: testl %eax, %eax +; SSE41-NEXT: js .LBB1_1 +; SSE41-NEXT: # %bb.2: ; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: js .LBB1_2 -; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: jmp .LBB1_3 +; SSE41-NEXT: .LBB1_1: ; SSE41-NEXT: movaps %xmm2, %xmm3 -; SSE41-NEXT: .LBB1_2: -; SSE41-NEXT: movaps %xmm3, %xmm1 -; SSE41-NEXT: cmpunordss %xmm3, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm4 -; SSE41-NEXT: andps %xmm3, %xmm4 -; SSE41-NEXT: js .LBB1_4 -; SSE41-NEXT: # %bb.3: ; SSE41-NEXT: movaps %xmm0, %xmm2 -; SSE41-NEXT: .LBB1_4: -; SSE41-NEXT: maxss %xmm2, %xmm3 -; SSE41-NEXT: andnps %xmm3, %xmm1 -; SSE41-NEXT: orps %xmm4, %xmm1 +; SSE41-NEXT: .LBB1_3: +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: maxss %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: cmpunordss %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -171,65 +168,57 @@ define float @test_v4f32(<4 x float> %a0) { ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: testl %eax, %eax -; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: js .LBB2_2 -; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: js .LBB2_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: jmp .LBB2_3 +; SSE41-NEXT: .LBB2_1: +; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: .LBB2_3: +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE41-NEXT: movaps %xmm3, %xmm4 -; SSE41-NEXT: .LBB2_2: -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE41-NEXT: movaps %xmm4, %xmm2 -; SSE41-NEXT: cmpunordss %xmm4, %xmm2 -; SSE41-NEXT: movaps %xmm2, %xmm5 -; SSE41-NEXT: andps %xmm4, %xmm5 -; SSE41-NEXT: js .LBB2_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: movaps %xmm0, %xmm3 -; 
SSE41-NEXT: .LBB2_4: -; SSE41-NEXT: maxss %xmm3, %xmm4 -; SSE41-NEXT: andnps %xmm4, %xmm2 -; SSE41-NEXT: orps %xmm5, %xmm2 -; SSE41-NEXT: movd %xmm2, %eax +; SSE41-NEXT: maxss %xmm0, %xmm4 +; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: cmpunordss %xmm3, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm4 +; SSE41-NEXT: movd %xmm4, %eax ; SSE41-NEXT: testl %eax, %eax -; SSE41-NEXT: movaps %xmm2, %xmm4 -; SSE41-NEXT: js .LBB2_6 +; SSE41-NEXT: js .LBB2_4 ; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: movaps %xmm1, %xmm4 +; SSE41-NEXT: movaps %xmm4, %xmm0 +; SSE41-NEXT: jmp .LBB2_6 +; SSE41-NEXT: .LBB2_4: +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: movaps %xmm4, %xmm2 ; SSE41-NEXT: .LBB2_6: -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE41-NEXT: movaps %xmm4, %xmm3 -; SSE41-NEXT: cmpunordss %xmm4, %xmm3 -; SSE41-NEXT: movaps %xmm3, %xmm5 -; SSE41-NEXT: andps %xmm4, %xmm5 -; SSE41-NEXT: js .LBB2_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: movaps %xmm2, %xmm1 -; SSE41-NEXT: .LBB2_8: -; SSE41-NEXT: maxss %xmm1, %xmm4 -; SSE41-NEXT: andnps %xmm4, %xmm3 -; SSE41-NEXT: orps %xmm5, %xmm3 +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: maxss %xmm0, %xmm3 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: cmpunordss %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm3 ; SSE41-NEXT: movd %xmm3, %eax ; SSE41-NEXT: testl %eax, %eax -; SSE41-NEXT: movaps %xmm3, %xmm2 -; SSE41-NEXT: js .LBB2_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: movaps %xmm0, %xmm2 -; SSE41-NEXT: .LBB2_10: -; SSE41-NEXT: movaps %xmm2, %xmm1 -; SSE41-NEXT: cmpunordss %xmm2, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm4 -; SSE41-NEXT: andps %xmm2, %xmm4 -; SSE41-NEXT: js .LBB2_12 -; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: js .LBB2_7 +; SSE41-NEXT: # %bb.8: ; SSE41-NEXT: movaps %xmm3, %xmm0 -; SSE41-NEXT: .LBB2_12: +; SSE41-NEXT: jmp .LBB2_9 +; SSE41-NEXT: .LBB2_7: +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm3, %xmm1 +; SSE41-NEXT: .LBB2_9: +; SSE41-NEXT: movaps %xmm1, %xmm2 ; SSE41-NEXT: maxss 
%xmm0, %xmm2 -; SSE41-NEXT: andnps %xmm2, %xmm1 -; SSE41-NEXT: orps %xmm4, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: cmpunordss %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: @@ -410,61 +399,53 @@ define float @test_v8f32(<8 x float> %a0) { ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE41-NEXT: movd %xmm2, %eax ; SSE41-NEXT: testl %eax, %eax -; SSE41-NEXT: movaps %xmm2, %xmm3 -; SSE41-NEXT: js .LBB3_2 -; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: js .LBB3_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: jmp .LBB3_3 +; SSE41-NEXT: .LBB3_1: +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: .LBB3_3: ; SSE41-NEXT: movaps %xmm1, %xmm3 -; SSE41-NEXT: .LBB3_2: -; SSE41-NEXT: movaps %xmm3, %xmm0 -; SSE41-NEXT: cmpunordss %xmm3, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: andps %xmm3, %xmm4 -; SSE41-NEXT: js .LBB3_4 -; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: maxss %xmm0, %xmm3 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: cmpunordss %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movd %xmm3, %eax ; SSE41-NEXT: movaps %xmm2, %xmm1 -; SSE41-NEXT: .LBB3_4: -; SSE41-NEXT: maxss %xmm1, %xmm3 -; SSE41-NEXT: andnps %xmm3, %xmm0 -; SSE41-NEXT: orps %xmm4, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: movaps %xmm2, %xmm3 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE41-NEXT: testl %eax, %eax -; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: js .LBB3_6 +; SSE41-NEXT: js .LBB3_4 ; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: movaps %xmm3, %xmm4 +; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: jmp .LBB3_6 +; SSE41-NEXT: .LBB3_4: +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm3, %xmm1 ; SSE41-NEXT: .LBB3_6: -; SSE41-NEXT: movaps %xmm4, %xmm1 -; SSE41-NEXT: cmpunordss %xmm4, %xmm1 -; SSE41-NEXT: movaps 
%xmm1, %xmm5 -; SSE41-NEXT: andps %xmm4, %xmm5 -; SSE41-NEXT: js .LBB3_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: .LBB3_8: -; SSE41-NEXT: maxss %xmm3, %xmm4 -; SSE41-NEXT: andnps %xmm4, %xmm1 -; SSE41-NEXT: orps %xmm5, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: movaps %xmm1, %xmm3 +; SSE41-NEXT: maxss %xmm0, %xmm3 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: cmpunordss %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movd %xmm3, %eax ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE41-NEXT: testl %eax, %eax -; SSE41-NEXT: movaps %xmm1, %xmm3 -; SSE41-NEXT: js .LBB3_10 -; SSE41-NEXT: # %bb.9: -; SSE41-NEXT: movaps %xmm2, %xmm3 -; SSE41-NEXT: .LBB3_10: +; SSE41-NEXT: js .LBB3_7 +; SSE41-NEXT: # %bb.8: ; SSE41-NEXT: movaps %xmm3, %xmm0 -; SSE41-NEXT: cmpunordss %xmm3, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: andps %xmm3, %xmm4 -; SSE41-NEXT: js .LBB3_12 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: .LBB3_12: -; SSE41-NEXT: maxss %xmm2, %xmm3 -; SSE41-NEXT: andnps %xmm3, %xmm0 -; SSE41-NEXT: orps %xmm4, %xmm0 +; SSE41-NEXT: jmp .LBB3_9 +; SSE41-NEXT: .LBB3_7: +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: movaps %xmm3, %xmm2 +; SSE41-NEXT: .LBB3_9: +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: maxss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: cmpunordss %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: @@ -747,73 +728,65 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE41-NEXT: movaps %xmm2, %xmm0 ; SSE41-NEXT: cmpunordps %xmm2, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: movaps %xmm1, %xmm3 ; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2 +; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm3 ; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm5 -; SSE41-NEXT: movaps 
%xmm5, %xmm1 -; SSE41-NEXT: maxps %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm5, %xmm2 +; SSE41-NEXT: maxps %xmm3, %xmm2 ; SSE41-NEXT: movaps %xmm5, %xmm0 ; SSE41-NEXT: cmpunordps %xmm5, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE41-NEXT: movd %xmm2, %eax ; SSE41-NEXT: testl %eax, %eax +; SSE41-NEXT: js .LBB4_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: jmp .LBB4_3 +; SSE41-NEXT: .LBB4_1: +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: .LBB4_3: ; SSE41-NEXT: movaps %xmm1, %xmm3 -; SSE41-NEXT: js .LBB4_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: movaps %xmm2, %xmm3 -; SSE41-NEXT: .LBB4_2: -; SSE41-NEXT: movaps %xmm3, %xmm0 -; SSE41-NEXT: cmpunordss %xmm3, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: andps %xmm3, %xmm4 -; SSE41-NEXT: js .LBB4_4 -; SSE41-NEXT: # %bb.3: -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: .LBB4_4: -; SSE41-NEXT: maxss %xmm2, %xmm3 -; SSE41-NEXT: andnps %xmm3, %xmm0 -; SSE41-NEXT: orps %xmm4, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: movaps %xmm1, %xmm3 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE41-NEXT: maxss %xmm0, %xmm3 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: cmpunordss %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movd %xmm3, %eax +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE41-NEXT: testl %eax, %eax -; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: js .LBB4_6 +; SSE41-NEXT: js .LBB4_4 ; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: movaps %xmm3, %xmm4 +; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: jmp .LBB4_6 +; SSE41-NEXT: .LBB4_4: +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: movaps %xmm3, %xmm1 ; SSE41-NEXT: .LBB4_6: -; SSE41-NEXT: movaps %xmm4, %xmm2 -; 
SSE41-NEXT: cmpunordss %xmm4, %xmm2 -; SSE41-NEXT: movaps %xmm2, %xmm5 -; SSE41-NEXT: andps %xmm4, %xmm5 -; SSE41-NEXT: js .LBB4_8 -; SSE41-NEXT: # %bb.7: -; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: .LBB4_8: -; SSE41-NEXT: maxss %xmm3, %xmm4 -; SSE41-NEXT: andnps %xmm4, %xmm2 -; SSE41-NEXT: orps %xmm5, %xmm2 -; SSE41-NEXT: movd %xmm2, %eax -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE41-NEXT: testl %eax, %eax -; SSE41-NEXT: movaps %xmm2, %xmm3 -; SSE41-NEXT: js .LBB4_10 -; SSE41-NEXT: # %bb.9: ; SSE41-NEXT: movaps %xmm1, %xmm3 -; SSE41-NEXT: .LBB4_10: +; SSE41-NEXT: maxss %xmm0, %xmm3 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: cmpunordss %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movd %xmm3, %eax +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE41-NEXT: testl %eax, %eax +; SSE41-NEXT: js .LBB4_7 +; SSE41-NEXT: # %bb.8: ; SSE41-NEXT: movaps %xmm3, %xmm0 -; SSE41-NEXT: cmpunordss %xmm3, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: andps %xmm3, %xmm4 -; SSE41-NEXT: js .LBB4_12 -; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: jmp .LBB4_9 +; SSE41-NEXT: .LBB4_7: +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: movaps %xmm3, %xmm2 +; SSE41-NEXT: .LBB4_9: ; SSE41-NEXT: movaps %xmm2, %xmm1 -; SSE41-NEXT: .LBB4_12: -; SSE41-NEXT: maxss %xmm1, %xmm3 -; SSE41-NEXT: andnps %xmm3, %xmm0 -; SSE41-NEXT: orps %xmm4, %xmm0 +; SSE41-NEXT: maxss %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: cmpunordss %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: @@ -986,30 +959,52 @@ define float @test_v16f32(<16 x float> %a0) { ; define double @test_v2f64(<2 x double> %a0) { -; SSE-LABEL: test_v2f64: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: testq %rax, %rax -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: js .LBB5_2 -; 
SSE-NEXT: # %bb.1: -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: .LBB5_2: -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: cmpunordsd %xmm3, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm4 -; SSE-NEXT: andpd %xmm3, %xmm4 -; SSE-NEXT: js .LBB5_4 -; SSE-NEXT: # %bb.3: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: .LBB5_4: -; SSE-NEXT: maxsd %xmm2, %xmm3 -; SSE-NEXT: andnpd %xmm3, %xmm1 -; SSE-NEXT: orpd %xmm4, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: js .LBB5_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: .LBB5_2: +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm4 +; SSE2-NEXT: andpd %xmm3, %xmm4 +; SSE2-NEXT: js .LBB5_4 +; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: .LBB5_4: +; SSE2-NEXT: maxsd %xmm2, %xmm3 +; SSE2-NEXT: andnpd %xmm3, %xmm1 +; SSE2-NEXT: orpd %xmm4, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB5_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: jmp .LBB5_3 +; SSE41-NEXT: .LBB5_1: +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: .LBB5_3: +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: maxsd %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: @@ -1092,34 +1087,32 @@ define double @test_v4f64(<4 x double> %a0) { ; ; SSE41-LABEL: 
test_v4f64: ; SSE41: # %bb.0: -; SSE41-NEXT: movapd %xmm0, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movapd %xmm0, %xmm2 +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 ; SSE41-NEXT: blendvpd %xmm0, %xmm0, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: maxpd %xmm3, %xmm2 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: maxpd %xmm2, %xmm3 ; SSE41-NEXT: movapd %xmm1, %xmm0 ; SSE41-NEXT: cmpunordpd %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE41-NEXT: movq %xmm2, %rax +; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movapd %xmm3, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE41-NEXT: movq %xmm3, %rax ; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: movapd %xmm2, %xmm3 -; SSE41-NEXT: js .LBB6_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: .LBB6_2: +; SSE41-NEXT: js .LBB6_1 +; SSE41-NEXT: # %bb.2: ; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: cmpunordsd %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm0, %xmm4 -; SSE41-NEXT: andpd %xmm3, %xmm4 -; SSE41-NEXT: js .LBB6_4 -; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: jmp .LBB6_3 +; SSE41-NEXT: .LBB6_1: +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm2 +; SSE41-NEXT: .LBB6_3: ; SSE41-NEXT: movapd %xmm2, %xmm1 -; SSE41-NEXT: .LBB6_4: -; SSE41-NEXT: maxsd %xmm1, %xmm3 -; SSE41-NEXT: andnpd %xmm3, %xmm0 -; SSE41-NEXT: orpd %xmm4, %xmm0 +; SSE41-NEXT: maxsd %xmm0, %xmm1 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f64: @@ -1310,22 +1303,20 @@ define double @test_v8f64(<8 x double> %a0) { ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE41-NEXT: movq %xmm1, %rax ; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: js .LBB7_2 -; SSE41-NEXT: # 
%bb.1: -; SSE41-NEXT: movapd %xmm2, %xmm3 -; SSE41-NEXT: .LBB7_2: -; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: cmpunordsd %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm0, %xmm4 -; SSE41-NEXT: andpd %xmm3, %xmm4 -; SSE41-NEXT: js .LBB7_4 -; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: js .LBB7_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: jmp .LBB7_3 +; SSE41-NEXT: .LBB7_1: +; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: .LBB7_4: -; SSE41-NEXT: maxsd %xmm2, %xmm3 -; SSE41-NEXT: andnpd %xmm3, %xmm0 -; SSE41-NEXT: orpd %xmm4, %xmm0 +; SSE41-NEXT: .LBB7_3: +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: maxsd %xmm0, %xmm1 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f64: @@ -1646,22 +1637,20 @@ define double @test_v16f64(<16 x double> %a0) { ; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] ; SSE41-NEXT: movq %xmm1, %rax ; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: js .LBB8_2 -; SSE41-NEXT: # %bb.1: -; SSE41-NEXT: movapd %xmm2, %xmm3 -; SSE41-NEXT: .LBB8_2: -; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: cmpunordsd %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm0, %xmm4 -; SSE41-NEXT: andpd %xmm3, %xmm4 -; SSE41-NEXT: js .LBB8_4 -; SSE41-NEXT: # %bb.3: +; SSE41-NEXT: js .LBB8_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: jmp .LBB8_3 +; SSE41-NEXT: .LBB8_1: +; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: .LBB8_4: -; SSE41-NEXT: maxsd %xmm2, %xmm3 -; SSE41-NEXT: andnpd %xmm3, %xmm0 -; SSE41-NEXT: orpd %xmm4, %xmm0 +; SSE41-NEXT: .LBB8_3: +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: maxsd %xmm0, %xmm1 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 ; SSE41-NEXT: 
retq ; ; AVX-LABEL: test_v16f64: @@ -1792,3 +1781,5 @@ declare double @llvm.vector.reduce.fmaximum.v2f64(<2 x double>) declare double @llvm.vector.reduce.fmaximum.v4f64(<4 x double>) declare double @llvm.vector.reduce.fmaximum.v8f64(<8 x double>) declare double @llvm.vector.reduce.fmaximum.v16f64(<16 x double>) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; SSE: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll index 5ae9e552d0dcd..727af12217c67 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW @@ -36,13 +36,10 @@ define float @test_v2f32(<2 x float> %a0) { ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: cmpunordss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm3 -; SSE41-NEXT: andps %xmm2, %xmm3 -; SSE41-NEXT: minss %xmm0, %xmm2 -; SSE41-NEXT: andnps %xmm2, %xmm1 -; SSE41-NEXT: orps %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: minss %xmm0, %xmm1 +; 
SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -99,31 +96,26 @@ define float @test_v4f32(<4 x float> %a0) { ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm0, %xmm2 -; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE41-NEXT: cmpunordss %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: andps %xmm3, %xmm4 -; SSE41-NEXT: minss %xmm1, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; SSE41-NEXT: andnps %xmm3, %xmm0 -; SSE41-NEXT: orps %xmm4, %xmm0 -; SSE41-NEXT: movaps %xmm2, %xmm3 -; SSE41-NEXT: minss %xmm0, %xmm3 -; SSE41-NEXT: cmpunordss %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] ; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: andnps %xmm3, %xmm4 -; SSE41-NEXT: andps %xmm2, %xmm0 -; SSE41-NEXT: orps %xmm4, %xmm0 -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: minss %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: minss %xmm0, %xmm1 ; SSE41-NEXT: cmpunordss %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: andnps %xmm2, %xmm3 -; SSE41-NEXT: andps %xmm1, %xmm0 -; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm4, %xmm2 +; SSE41-NEXT: minss %xmm1, %xmm2 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: movaps %xmm3, %xmm1 +; SSE41-NEXT: minss %xmm2, %xmm1 +; SSE41-NEXT: cmpunordss %xmm2, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: @@ -199,35 +191,30 @@ define float @test_v8f32(<8 x float> 
%a0) { ; ; SSE41-LABEL: test_v8f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: minps %xmm0, %xmm2 -; SSE41-NEXT: cmpunordps %xmm0, %xmm0 -; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE41-NEXT: movaps %xmm2, %xmm0 -; SSE41-NEXT: cmpunordss %xmm2, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: andps %xmm1, %xmm3 -; SSE41-NEXT: minss %xmm2, %xmm1 -; SSE41-NEXT: andnps %xmm1, %xmm0 -; SSE41-NEXT: orps %xmm3, %xmm0 -; SSE41-NEXT: movaps %xmm2, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSE41-NEXT: movaps %xmm1, %xmm3 -; SSE41-NEXT: minss %xmm0, %xmm3 -; SSE41-NEXT: cmpunordss %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: andnps %xmm3, %xmm4 -; SSE41-NEXT: andps %xmm1, %xmm0 -; SSE41-NEXT: orps %xmm4, %xmm0 -; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSE41-NEXT: minps %xmm0, %xmm3 +; SSE41-NEXT: cmpunordps %xmm0, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm3[1,1,3,3] ; SSE41-NEXT: movaps %xmm2, %xmm1 -; SSE41-NEXT: minss %xmm0, %xmm1 -; SSE41-NEXT: cmpunordss %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: andnps %xmm1, %xmm3 -; SSE41-NEXT: andps %xmm2, %xmm0 -; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: minss %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: cmpunordss %xmm3, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm3, %xmm4 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE41-NEXT: movaps %xmm4, %xmm2 +; SSE41-NEXT: minss %xmm1, %xmm2 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] +; SSE41-NEXT: movaps %xmm3, %xmm1 +; SSE41-NEXT: minss %xmm2, %xmm1 +; SSE41-NEXT: cmpunordss %xmm2, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movaps 
%xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: @@ -391,36 +378,31 @@ define float @test_v16f32(<16 x float> %a0) { ; SSE41-NEXT: cmpunordps %xmm1, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movaps %xmm2, %xmm1 -; SSE41-NEXT: minps %xmm4, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm3 +; SSE41-NEXT: minps %xmm4, %xmm3 ; SSE41-NEXT: cmpunordps %xmm4, %xmm4 ; SSE41-NEXT: movaps %xmm4, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm3 +; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSE41-NEXT: movaps %xmm2, %xmm1 +; SSE41-NEXT: minss %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: cmpunordss %xmm3, %xmm0 ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: cmpunordss %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: movaps %xmm3, %xmm4 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSE41-NEXT: movaps %xmm4, %xmm2 ; SSE41-NEXT: minss %xmm1, %xmm2 -; SSE41-NEXT: andnps %xmm2, %xmm0 -; SSE41-NEXT: orps %xmm3, %xmm0 -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE41-NEXT: movaps %xmm2, %xmm3 -; SSE41-NEXT: minss %xmm0, %xmm3 -; SSE41-NEXT: cmpunordss %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm4 -; SSE41-NEXT: andnps %xmm3, %xmm4 -; SSE41-NEXT: andps %xmm2, %xmm0 -; SSE41-NEXT: orps %xmm4, %xmm0 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: minss %xmm0, %xmm2 -; SSE41-NEXT: cmpunordss %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: andnps %xmm2, %xmm3 -; SSE41-NEXT: andps %xmm1, %xmm0 -; SSE41-NEXT: orps %xmm3, %xmm0 +; SSE41-NEXT: cmpunordss %xmm1, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm2 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] +; SSE41-NEXT: movaps %xmm3, %xmm1 
+; SSE41-NEXT: minss %xmm2, %xmm1 +; SSE41-NEXT: cmpunordss %xmm2, %xmm2 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: @@ -597,19 +579,30 @@ define float @test_v16f32(<16 x float> %a0) { ; define double @test_v2f64(<2 x double> %a0) { -; SSE-LABEL: test_v2f64: -; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: cmpunordsd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm3 -; SSE-NEXT: andpd %xmm2, %xmm3 -; SSE-NEXT: minsd %xmm0, %xmm2 -; SSE-NEXT: andnpd %xmm2, %xmm1 -; SSE-NEXT: orpd %xmm3, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd %xmm0, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: movapd %xmm0, %xmm1 +; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: minsd %xmm0, %xmm2 +; SSE2-NEXT: andnpd %xmm2, %xmm1 +; SSE2-NEXT: orpd %xmm3, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movapd %xmm0, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: movapd %xmm2, %xmm1 +; SSE41-NEXT: minsd %xmm0, %xmm1 +; SSE41-NEXT: cmpunordsd %xmm0, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: @@ -661,15 +654,14 @@ define double @test_v3f64(<3 x double> %a0) { ; SSE41-NEXT: minpd %xmm0, %xmm1 ; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: cmpunordsd %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm0, %xmm3 -; SSE41-NEXT: andpd %xmm2, %xmm3 +; 
SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE41-NEXT: movapd %xmm3, %xmm2 ; SSE41-NEXT: minsd %xmm1, %xmm2 -; SSE41-NEXT: andnpd %xmm2, %xmm0 -; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v3f64: @@ -727,15 +719,14 @@ define double @test_v4f64(<4 x double> %a0) { ; SSE41-NEXT: minpd %xmm0, %xmm2 ; SSE41-NEXT: cmpunordpd %xmm0, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE41-NEXT: movapd %xmm2, %xmm0 -; SSE41-NEXT: cmpunordsd %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm0, %xmm3 -; SSE41-NEXT: andpd %xmm1, %xmm3 +; SSE41-NEXT: movapd %xmm2, %xmm3 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE41-NEXT: movapd %xmm3, %xmm1 ; SSE41-NEXT: minsd %xmm2, %xmm1 -; SSE41-NEXT: andnpd %xmm1, %xmm0 -; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm2, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f64: @@ -823,15 +814,14 @@ define double @test_v8f64(<8 x double> %a0) { ; SSE41-NEXT: cmpunordpd %xmm4, %xmm4 ; SSE41-NEXT: movapd %xmm4, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: cmpunordsd %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm0, %xmm3 -; SSE41-NEXT: andpd %xmm2, %xmm3 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE41-NEXT: movapd %xmm3, %xmm2 ; SSE41-NEXT: minsd %xmm1, %xmm2 -; SSE41-NEXT: andnpd %xmm2, %xmm0 -; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: 
blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f64: @@ -1015,15 +1005,14 @@ define double @test_v16f64(<16 x double> %a0) { ; SSE41-NEXT: cmpunordpd %xmm2, %xmm2 ; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: cmpunordsd %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm0, %xmm3 -; SSE41-NEXT: andpd %xmm2, %xmm3 +; SSE41-NEXT: movapd %xmm1, %xmm3 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE41-NEXT: movapd %xmm3, %xmm2 ; SSE41-NEXT: minsd %xmm1, %xmm2 -; SSE41-NEXT: andnpd %xmm2, %xmm0 -; SSE41-NEXT: orpd %xmm3, %xmm0 +; SSE41-NEXT: cmpunordsd %xmm1, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; SSE41-NEXT: movapd %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f64: diff --git a/llvm/test/CodeGen/X86/vselect-zero.ll b/llvm/test/CodeGen/X86/vselect-zero.ll index 50040593848a2..b3bb01137c70d 100644 --- a/llvm/test/CodeGen/X86/vselect-zero.ll +++ b/llvm/test/CodeGen/X86/vselect-zero.ll @@ -113,14 +113,22 @@ define float @fsel_zero_true_val(float %a, float %b, float %x) { } define double @fsel_nonzero_false_val(double %x, double %y, double %z) { -; SSE-LABEL: fsel_nonzero_false_val: -; SSE: # %bb.0: -; SSE-NEXT: cmpeqsd %xmm1, %xmm0 -; SSE-NEXT: andpd %xmm0, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0] -; SSE-NEXT: andnpd %xmm1, %xmm0 -; SSE-NEXT: orpd %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: fsel_nonzero_false_val: +; SSE2: # %bb.0: +; SSE2-NEXT: cmpeqsd %xmm1, %xmm0 +; SSE2-NEXT: andpd %xmm0, %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0] +; SSE2-NEXT: andnpd %xmm1, %xmm0 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: fsel_nonzero_false_val: +; SSE42: # %bb.0: +; SSE42-NEXT: cmpeqsd %xmm1, %xmm0 +; SSE42-NEXT: movapd {{.*#+}} xmm1 = 
[4.2E+1,4.2E+1] +; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE42-NEXT: movapd %xmm1, %xmm0 +; SSE42-NEXT: retq ; ; AVX-LABEL: fsel_nonzero_false_val: ; AVX: # %bb.0: @@ -142,14 +150,21 @@ define double @fsel_nonzero_false_val(double %x, double %y, double %z) { } define double @fsel_nonzero_true_val(double %x, double %y, double %z) { -; SSE-LABEL: fsel_nonzero_true_val: -; SSE: # %bb.0: -; SSE-NEXT: cmpeqsd %xmm1, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0] -; SSE-NEXT: andpd %xmm0, %xmm1 -; SSE-NEXT: andnpd %xmm2, %xmm0 -; SSE-NEXT: orpd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: fsel_nonzero_true_val: +; SSE2: # %bb.0: +; SSE2-NEXT: cmpeqsd %xmm1, %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0] +; SSE2-NEXT: andpd %xmm0, %xmm1 +; SSE2-NEXT: andnpd %xmm2, %xmm0 +; SSE2-NEXT: orpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: fsel_nonzero_true_val: +; SSE42: # %bb.0: +; SSE42-NEXT: cmpeqsd %xmm1, %xmm0 +; SSE42-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE42-NEXT: movapd %xmm2, %xmm0 +; SSE42-NEXT: retq ; ; AVX-LABEL: fsel_nonzero_true_val: ; AVX: # %bb.0: From 65a92544f7716541cdfab99499ce467b26a3ce8e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 10 Feb 2025 11:27:26 +0000 Subject: [PATCH 137/293] [X86] canonicalizeShuffleWithOp - pull out repeated flag settings to IsMergeableWithShuffle lambda. NFC. Prep work before tweaking the flags in a future patch. 
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4e7ee83d00488..34ac4262beb85 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -41598,10 +41598,10 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) { SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0)); SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1)); - if (IsMergeableWithShuffle(Op00, Opc != X86ISD::VPERMI, - Opc != X86ISD::PSHUFB) || - IsMergeableWithShuffle(Op01, Opc != X86ISD::VPERMI, - Opc != X86ISD::PSHUFB)) { + bool FoldShuf = Opc != X86ISD::VPERMI; + bool FoldLoad = Opc != X86ISD::PSHUFB; + if (IsMergeableWithShuffle(Op00, FoldShuf, FoldLoad) || + IsMergeableWithShuffle(Op01, FoldShuf, FoldLoad)) { SDValue LHS, RHS; Op00 = DAG.getBitcast(ShuffleVT, Op00); Op01 = DAG.getBitcast(ShuffleVT, Op01); From 7ee56b9afcad456bb662aad941052af334fe3a11 Mon Sep 17 00:00:00 2001 From: wldfngrs Date: Mon, 10 Feb 2025 12:38:55 +0100 Subject: [PATCH 138/293] [libc][math][c23] Add asinf16() function (#124212) Co-authored-by: OverMighty --- libc/config/linux/x86_64/entrypoints.txt | 1 + libc/docs/headers/math/index.rst | 2 +- libc/include/math.yaml | 7 ++ libc/src/math/CMakeLists.txt | 2 + libc/src/math/asinf16.h | 21 ++++ libc/src/math/generic/CMakeLists.txt | 19 ++++ libc/src/math/generic/asinf16.cpp | 133 ++++++++++++++++++++++ libc/test/src/math/CMakeLists.txt | 11 ++ libc/test/src/math/asinf16_test.cpp | 42 +++++++ libc/test/src/math/smoke/CMakeLists.txt | 11 ++ libc/test/src/math/smoke/asinf16_test.cpp | 42 +++++++ 11 files changed, 290 insertions(+), 1 deletion(-) create mode 100644 libc/src/math/asinf16.h create mode 100644 libc/src/math/generic/asinf16.cpp create mode 100644 
libc/test/src/math/asinf16_test.cpp create mode 100644 libc/test/src/math/smoke/asinf16_test.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 81dceb74a1774..0a942516db6c3 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -647,6 +647,7 @@ endif() if(LIBC_TYPES_HAS_FLOAT16) list(APPEND TARGET_LIBM_ENTRYPOINTS # math.h C23 _Float16 entrypoints + libc.src.math.asinf16 libc.src.math.canonicalizef16 libc.src.math.ceilf16 libc.src.math.copysignf16 diff --git a/libc/docs/headers/math/index.rst b/libc/docs/headers/math/index.rst index 8548e4a5773bc..3e45e3e618abb 100644 --- a/libc/docs/headers/math/index.rst +++ b/libc/docs/headers/math/index.rst @@ -256,7 +256,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | acospi | | | | | | 7.12.4.8 | F.10.1.8 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| asin | |check| | | | | | 7.12.4.2 | F.10.1.2 | +| asin | |check| | | | |check| | | 7.12.4.2 | F.10.1.2 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | asinh | |check| | | | | | 7.12.5.2 | F.10.2.2 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/include/math.yaml b/libc/include/math.yaml index 3a660a59d3605..b98bc55f6cc53 100644 --- a/libc/include/math.yaml +++ b/libc/include/math.yaml @@ -32,6 +32,13 @@ functions: return_type: float arguments: - type: float + - name: asinf16 + standards: + - stdc + 
return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 - name: asinhf standards: - stdc diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index fe5ebd793b40a..82551a4b57f45 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -47,6 +47,8 @@ add_math_entrypoint_object(acoshf) add_math_entrypoint_object(asin) add_math_entrypoint_object(asinf) +add_math_entrypoint_object(asinf16) + add_math_entrypoint_object(asinh) add_math_entrypoint_object(asinhf) diff --git a/libc/src/math/asinf16.h b/libc/src/math/asinf16.h new file mode 100644 index 0000000000000..f16647ec2a6f9 --- /dev/null +++ b/libc/src/math/asinf16.h @@ -0,0 +1,21 @@ +//===-- Implementation header for asinf16 -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_ASINF16_H +#define LLVM_LIBC_SRC_MATH_ASINF16_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 asinf16(float16 x); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_ASINF16_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 14e63d6cc1395..cd34d1cd0b914 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -4598,6 +4598,25 @@ add_entrypoint_object( ${libc_opt_high_flag} ) +add_entrypoint_object( + asinf16 + SRCS + asinf16.cpp + HDRS + ../asinf16.h + DEPENDS + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.sqrt + libc.src.__support.macros.optimization + libc.src.__support.macros.properties.types +) + add_entrypoint_object( acosf SRCS diff --git a/libc/src/math/generic/asinf16.cpp b/libc/src/math/generic/asinf16.cpp new file mode 100644 index 0000000000000..518c384a61530 --- /dev/null +++ b/libc/src/math/generic/asinf16.cpp @@ -0,0 +1,133 @@ +//===-- Half-precision asinf16(x) function --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception. 
+// +//===----------------------------------------------------------------------===// + +#include "src/math/asinf16.h" +#include "hdr/errno_macros.h" +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/sqrt.h" +#include "src/__support/macros/optimization.h" + +namespace LIBC_NAMESPACE_DECL { + +// Generated by Sollya using the following command: +// > round(pi/2, D, RN); +static constexpr float PI_2 = 0x1.921fb54442d18p0f; + +LLVM_LIBC_FUNCTION(float16, asinf16, (float16 x)) { + using FPBits = fputil::FPBits; + FPBits xbits(x); + + uint16_t x_u = xbits.uintval(); + uint16_t x_abs = x_u & 0x7fff; + float xf = x; + + // |x| > 0x1p0, |x| > 1, or x is NaN. + if (LIBC_UNLIKELY(x_abs > 0x3c00)) { + // asinf16(NaN) = NaN + if (xbits.is_nan()) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + return x; + } + + // 1 < |x| <= +/-inf + fputil::raise_except_if_required(FE_INVALID); + fputil::set_errno_if_required(EDOM); + + return FPBits::quiet_nan().get_val(); + } + + float xsq = xf * xf; + + // |x| <= 0x1p-1, |x| <= 0.5 + if (x_abs <= 0x3800) { + // asinf16(+/-0) = +/-0 + if (LIBC_UNLIKELY(x_abs == 0)) + return x; + + // Exhaustive tests show that, + // for |x| <= 0x1.878p-9, when: + // x > 0, and rounding upward, or + // x < 0, and rounding downward, then, + // asin(x) = x * 2^-11 + x + // else, in other rounding modes, + // asin(x) = x + if (LIBC_UNLIKELY(x_abs <= 0x1a1e)) { + int rounding = fputil::quick_get_round(); + + if ((xbits.is_pos() && rounding == FE_UPWARD) || + (xbits.is_neg() && rounding == FE_DOWNWARD)) + return fputil::cast(fputil::multiply_add(xf, 0x1.0p-11f, xf)); + return x; + } + + // Degree-6 minimax odd polynomial of asin(x) generated by Sollya 
with: + // > P = fpminimax(asin(x)/x, [|0, 2, 4, 6, 8|], [|SG...|], [0, 0.5]); + float result = + fputil::polyeval(xsq, 0x1.000002p0f, 0x1.554c2ap-3f, 0x1.3541ccp-4f, + 0x1.43b2d6p-5f, 0x1.a0d73ep-5f); + return fputil::cast<float16>(xf * result); + } + + // When |x| > 0.5, assume that 0.5 < |x| <= 1, + // + // Step-by-step range-reduction proof: + // 1: Let y = asin(x), such that, x = sin(y) + // 2: From complementary angle identity: + // x = sin(y) = cos(pi/2 - y) + // 3: Let z = pi/2 - y, such that x = cos(z) + // 4: From double angle formula; cos(2A) = 1 - 2 * sin^2(A): + // z = 2A, z/2 = A + // cos(z) = 1 - 2 * sin^2(z/2) + // 5: Make sin(z/2) subject of the formula: + // sin(z/2) = sqrt((1 - cos(z))/2) + // 6: Recall [3]; x = cos(z). Therefore: + // sin(z/2) = sqrt((1 - x)/2) + // 7: Let u = (1 - x)/2 + // 8: Therefore: + // asin(sqrt(u)) = z/2 + // 2 * asin(sqrt(u)) = z + // 9: Recall [3], z = pi/2 - y. Therefore: + // y = pi/2 - z + // y = pi/2 - 2 * asin(sqrt(u)) + // 10: Recall [1], y = asin(x). Therefore: + // asin(x) = pi/2 - 2 * asin(sqrt(u)) + // + // WHY? + // 11: Recall [7], u = (1 - x)/2 + // 12: Since 0.5 < x <= 1, therefore: + // 0 <= u <= 0.25 and 0 <= sqrt(u) <= 0.5 + // + // Hence, we can reuse the same [0, 0.5] domain polynomial approximation for + // Step [10] as `sqrt(u)` is in range. + + // 0x1p-1 < |x| <= 0x1p0, 0.5 < |x| <= 1.0 + float xf_abs = (xf < 0 ? -xf : xf); + float sign = (xbits.uintval() >> 15 == 1 ? 
-1.0 : 1.0); + float u = fputil::multiply_add(-0.5f, xf_abs, 0.5f); + float u_sqrt = fputil::sqrt(u); + + // Degree-6 minimax odd polynomial of asin(x) generated by Sollya with: + // > P = fpminimax(asin(x)/x, [|0, 2, 4, 6, 8|], [|SG...|], [0, 0.5]); + float asin_sqrt_u = + u_sqrt * fputil::polyeval(u, 0x1.000002p0f, 0x1.554c2ap-3f, + 0x1.3541ccp-4f, 0x1.43b2d6p-5f, 0x1.a0d73ep-5f); + + return fputil::cast(sign * + fputil::multiply_add(-2.0f, asin_sqrt_u, PI_2)); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index f000ff6f3cf47..6a3dd8c9deff0 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -2186,6 +2186,17 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + asinf16_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + asinf16_test.cpp + DEPENDS + libc.src.math.asinf16 +) + add_fp_unittest( acosf_test NEED_MPFR diff --git a/libc/test/src/math/asinf16_test.cpp b/libc/test/src/math/asinf16_test.cpp new file mode 100644 index 0000000000000..9593cad16ac77 --- /dev/null +++ b/libc/test/src/math/asinf16_test.cpp @@ -0,0 +1,42 @@ +//===-- Exhaustive test for asinf16 ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/asinf16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +using LlvmLibcAsinf16Test = LIBC_NAMESPACE::testing::FPTest; + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +// Range: [0, Inf] +static constexpr uint16_t POS_START = 0x0000U; +static constexpr uint16_t POS_STOP = 0x7c00U; + +// Range: [-Inf, 0] +static constexpr uint16_t NEG_START = 0x8000U; +static constexpr uint16_t NEG_STOP = 0xfc00U; + +TEST_F(LlvmLibcAsinf16Test, PositiveRange) { + for (uint16_t v = POS_START; v <= POS_STOP; ++v) { + float16 x = FPBits(v).get_val(); + + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asin, x, + LIBC_NAMESPACE::asinf16(x), 0.5); + } +} + +TEST_F(LlvmLibcAsinf16Test, NegativeRange) { + for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { + float16 x = FPBits(v).get_val(); + + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asin, x, + LIBC_NAMESPACE::asinf16(x), 0.5); + } +} diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index f3ecba3737e38..14447728fb18a 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -3957,6 +3957,17 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + asinf16_test + SUITE + libc-math-smoke-tests + SRCS + asinf16_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.math.asinf16 +) + add_fp_unittest( acosf_test SUITE diff --git a/libc/test/src/math/smoke/asinf16_test.cpp b/libc/test/src/math/smoke/asinf16_test.cpp new file mode 100644 index 0000000000000..9f675b08319c0 --- /dev/null +++ b/libc/test/src/math/smoke/asinf16_test.cpp @@ -0,0 +1,42 @@ +//===-- Unittests for asinf16 ---------------------------------------------===// +// +// +// Part of the LLVM Project, under the Apache License v2.0 
with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception. +// +//===----------------------------------------------------------------------===// + +#include "src/errno/libc_errno.h" +#include "src/math/asinf16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcAsinf16Test = LIBC_NAMESPACE::testing::FPTest; + +TEST_F(LlvmLibcAsinf16Test, SpecialNumbers) { + LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::asinf16(aNaN)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::asinf16(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ(zero, LIBC_NAMESPACE::asinf16(zero)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ(neg_zero, LIBC_NAMESPACE::asinf16(neg_zero)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::asinf16(inf)); + EXPECT_MATH_ERRNO(EDOM); + + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::asinf16(neg_inf)); + EXPECT_MATH_ERRNO(EDOM); + + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::asinf16(2.0f)); + EXPECT_MATH_ERRNO(EDOM); + + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::asinf16(-2.0f)); + EXPECT_MATH_ERRNO(EDOM); +} From 738cf5acc68c697dad5611b2424aa6b124b368f2 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 10 Feb 2025 11:42:06 +0000 Subject: [PATCH 139/293] InstSimplify: improve computePointerICmp (NFC) (#126255) The comment about inbounds protecting only against unsigned wrapping is incorrect: it also protects against signed wrapping, but the issue is that it could cross the sign boundary. 
--- llvm/lib/Analysis/InstructionSimplify.cpp | 27 ++++++----------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 3cbc4107433ef..59002cd934ab1 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2686,27 +2686,14 @@ static Constant *computePointerICmp(CmpPredicate Pred, Value *LHS, Value *RHS, const DataLayout &DL = Q.DL; const TargetLibraryInfo *TLI = Q.TLI; - // We can only fold certain predicates on pointer comparisons. - switch (Pred) { - default: + // We fold equality and unsigned predicates on pointer comparisons, but forbid + // signed predicates since a GEP with inbounds could cross the sign boundary. + if (CmpInst::isSigned(Pred)) return nullptr; - // Equality comparisons are easy to fold. - case CmpInst::ICMP_EQ: - case CmpInst::ICMP_NE: - break; - - // We can only handle unsigned relational comparisons because 'inbounds' on - // a GEP only protects against unsigned wrapping. - case CmpInst::ICMP_UGT: - case CmpInst::ICMP_UGE: - case CmpInst::ICMP_ULT: - case CmpInst::ICMP_ULE: - // However, we have to switch them to their signed variants to handle - // negative indices from the base pointer. - Pred = ICmpInst::getSignedPredicate(Pred); - break; - } + // We have to switch to a signed predicate to handle negative indices from + // the base pointer. + Pred = ICmpInst::getSignedPredicate(Pred); // Strip off any constant offsets so that we can reason about them. // It's tempting to use getUnderlyingObject or even just stripInBoundsOffsets @@ -2730,7 +2717,7 @@ static Constant *computePointerICmp(CmpPredicate Pred, Value *LHS, Value *RHS, ICmpInst::compare(LHSOffset, RHSOffset, Pred)); // Various optimizations for (in)equality comparisons. 
- if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) { + if (ICmpInst::isEquality(Pred)) { // Different non-empty allocations that exist at the same time have // different addresses (if the program can tell). If the offsets are // within the bounds of their allocations (and not one-past-the-end! From 0b5c318127b1ed8125bffd5df1c96067c2186878 Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Mon, 10 Feb 2025 19:44:24 +0800 Subject: [PATCH 140/293] [LoongArch] Merge base and offset for tls-le code sequence (#122999) Adapt the merge base offset pass to optimize the tls-le code sequence. --- .../LoongArch/LoongArchMergeBaseOffset.cpp | 165 ++++++++- .../LoongArch/machinelicm-address-pseudos.ll | 6 +- .../LoongArch/merge-base-offset-tlsle.ll | 318 +++++++----------- 3 files changed, 266 insertions(+), 223 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp b/llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp index 7f98f7718a538..2aae498e1f2de 100644 --- a/llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp @@ -37,6 +37,8 @@ class LoongArchMergeBaseOffsetOpt : public MachineFunctionPass { bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Lo12, MachineInstr *&Lo20, MachineInstr *&Hi12, MachineInstr *&Last); + bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Add, + MachineInstr *&Lo12); bool detectAndFoldOffset(MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20, MachineInstr *&Hi12, @@ -176,7 +178,80 @@ bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20, return true; } -// Update the offset in Hi20, Lo12, Lo20 and Hi12 instructions. +// Detect the pattern: +// +// (small/medium): +// lu12i.w vreg1, %le_hi20_r(s) +// add.w/d vreg2, vreg1, r2, %le_add_r(s) +// addi.w/d vreg3, vreg2, %le_lo12_r(s) + +// The pattern is only accepted if: +// 1) The first instruction has only one use, which is the PseudoAddTPRel. 
+// The second instruction has only one use, which is the ADDI. The +// second instruction's last operand is the tp register. +// 2) The address operands have the appropriate type, reflecting the +// lowering of a thread_local global address using the pattern. +// 3) The offset value in the ThreadLocal Global Address is 0. +bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20, + MachineInstr *&Add, + MachineInstr *&Lo12) { + if (Hi20.getOpcode() != LoongArch::LU12I_W) + return false; + + auto isGlobalOrCPI = [](const MachineOperand &Op) { + return Op.isGlobal() || Op.isCPI(); + }; + + const MachineOperand &Hi20Op1 = Hi20.getOperand(1); + if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_LE_HI_R || + !isGlobalOrCPI(Hi20Op1) || Hi20Op1.getOffset() != 0) + return false; + + Register HiDestReg = Hi20.getOperand(0).getReg(); + if (!MRI->hasOneUse(HiDestReg)) + return false; + + Add = &*MRI->use_instr_begin(HiDestReg); + if ((ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_D) || + (!ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_W)) + return false; + + if (Add->getOperand(2).getReg() != LoongArch::R2) + return false; + + const MachineOperand &AddOp3 = Add->getOperand(3); + if (LoongArchII::getDirectFlags(AddOp3) != LoongArchII::MO_LE_ADD_R || + !(isGlobalOrCPI(AddOp3) || AddOp3.isMCSymbol()) || + AddOp3.getOffset() != 0) + return false; + + Register AddDestReg = Add->getOperand(0).getReg(); + if (!MRI->hasOneUse(AddDestReg)) + return false; + + Lo12 = &*MRI->use_instr_begin(AddDestReg); + if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) || + (!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W)) + return false; + + const MachineOperand &Lo12Op2 = Lo12->getOperand(2); + if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_LE_LO_R || + !(isGlobalOrCPI(Lo12Op2) || Lo12Op2.isMCSymbol()) || + Lo12Op2.getOffset() != 0) + return false; + + if (Hi20Op1.isGlobal()) { + LLVM_DEBUG(dbgs() << " Found 
lowered global address: " + << *Hi20Op1.getGlobal() << "\n"); + } else if (Hi20Op1.isCPI()) { + LLVM_DEBUG(dbgs() << " Found lowered constant pool: " << Hi20Op1.getIndex() + << "\n"); + } + + return true; +} + +// Update the offset in Hi20, (Add), Lo12, (Lo20 and Hi12) instructions. // Delete the tail instruction and update all the uses to use the // output from Last. void LoongArchMergeBaseOffsetOpt::foldOffset( @@ -190,31 +265,49 @@ void LoongArchMergeBaseOffsetOpt::foldOffset( Lo20->getOperand(2).setOffset(Offset); Hi12->getOperand(2).setOffset(Offset); } + + // For tls-le, offset of the second PseudoAddTPRel instr should also be + // updated. + MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg()); + if (Hi20.getOpcode() == LoongArch::LU12I_W) + Add->getOperand(3).setOffset(Offset); + // Delete the tail instruction. MachineInstr *Def = Last ? Last : &Lo12; MRI->constrainRegClass(Def->getOperand(0).getReg(), MRI->getRegClass(Tail.getOperand(0).getReg())); MRI->replaceRegWith(Tail.getOperand(0).getReg(), Def->getOperand(0).getReg()); Tail.eraseFromParent(); + LLVM_DEBUG(dbgs() << " Merged offset " << Offset << " into base.\n" - << " " << Hi20 << " " << Lo12;); + << " " << Hi20;); + if (Hi20.getOpcode() == LoongArch::LU12I_W) { + LLVM_DEBUG(dbgs() << " " << *Add;); + } + LLVM_DEBUG(dbgs() << " " << Lo12;); if (Lo20 && Hi12) { LLVM_DEBUG(dbgs() << " " << *Lo20 << " " << *Hi12;); } } // Detect patterns for large offsets that are passed into an ADD instruction. -// If the pattern is found, updates the offset in Hi20, Lo12, Lo20 and Hi12 -// instructions and deletes TailAdd and the instructions that produced the -// offset. +// If the pattern is found, updates the offset in Hi20, (Add), Lo12, +// (Lo20 and Hi12) instructions and deletes TailAdd and the instructions that +// produced the offset. // // (The instructions marked with "!" 
are not necessarily present) // // Base address lowering is of the form: -// Hi20: pcalau12i vreg1, %pc_hi20(s) -// +- Lo12: addi.d vreg2, vreg1, %pc_lo12(s) -// | Lo20: lu32i.d vreg2, %pc64_lo20(s) ! -// +- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s) ! +// 1) pcala: +// Hi20: pcalau12i vreg1, %pc_hi20(s) +// +--- Lo12: addi.d vreg2, vreg1, %pc_lo12(s) +// | Lo20: lu32i.d vreg2, %pc64_lo20(s) ! +// +--- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s) ! +// | +// | 2) tls-le: +// | Hi20: lu12i.w vreg1, %le_hi20_r(s) +// | Add: add.w/d vreg1, vreg1, r2, %le_add_r(s) +// +--- Lo12: addi.w/d vreg2, vreg1, %le_lo12_r(s) // | // | The large offset can be one of the forms: // | @@ -334,7 +427,8 @@ bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20, // Look for arithmetic instructions we can get an offset from. // We might be able to remove the arithmetic instructions by folding the - // offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I). + // offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I) or + // LU12I_W+PseudoAddTPRel+ADDI. if (!MRI->hasOneUse(DestReg)) return false; @@ -454,6 +548,7 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20, // If all the uses are memory ops with the same offset, we can transform: // // 1. (small/medium): + // 1.1. pcala // pcalau12i vreg1, %pc_hi20(s) // addi.d vreg2, vreg1, %pc_lo12(s) // ld.w vreg3, 8(vreg2) @@ -463,6 +558,18 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20, // pcalau12i vreg1, %pc_hi20(s+8) // ld.w vreg3, vreg1, %pc_lo12(s+8)(vreg1) // + // 1.2. tls-le + // lu12i.w vreg1, %le_hi20_r(s) + // add.w/d vreg2, vreg1, r2, %le_add_r(s) + // addi.w/d vreg3, vreg2, %le_lo12_r(s) + // ld.w vreg4, 8(vreg3) + // + // => + // + // lu12i.w vreg1, %le_hi20_r(s+8) + // add.w/d vreg2, vreg1, r2, %le_add_r(s+8) + // ld.w vreg4, vreg2, %le_lo12_r(s+8)(vreg2) + // // 2. 
(large): // pcalau12i vreg1, %pc_hi20(s) // addi.d vreg2, $zero, %pc_lo12(s) @@ -598,7 +705,8 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20, return false; // If optimized by this pass successfully, MO_RELAX bitmask target-flag should - // be removed from the code sequence. + // be removed from the pcala code sequence. Code sequence of tls-le can still + // be relaxed after being optimized. // // For example: // pcalau12i $a0, %pc_hi20(symbol) @@ -614,15 +722,20 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20, // optimized, it cannot be relaxed any more. So MO_RELAX flag should not be // carried by them. Hi20.getOperand(1).setOffset(NewOffset); - Hi20.getOperand(1).setTargetFlags( - LoongArchII::getDirectFlags(Hi20.getOperand(1))); MachineOperand &ImmOp = Lo12.getOperand(2); ImmOp.setOffset(NewOffset); - ImmOp.setTargetFlags(LoongArchII::getDirectFlags(ImmOp)); if (Lo20 && Hi12) { Lo20->getOperand(2).setOffset(NewOffset); Hi12->getOperand(2).setOffset(NewOffset); } + if (Hi20.getOpcode() == LoongArch::PCALAU12I) { + Hi20.getOperand(1).setTargetFlags( + LoongArchII::getDirectFlags(Hi20.getOperand(1))); + ImmOp.setTargetFlags(LoongArchII::getDirectFlags(ImmOp)); + } else if (Hi20.getOpcode() == LoongArch::LU12I_W) { + MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg()); + Add->getOperand(3).setOffset(NewOffset); + } // Update the immediate in the load/store instructions to add the offset. 
const LoongArchInstrInfo &TII = *ST->getInstrInfo(); @@ -673,7 +786,14 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20, return true; } - MRI->replaceRegWith(Lo12.getOperand(0).getReg(), Hi20.getOperand(0).getReg()); + if (Hi20.getOpcode() == LoongArch::PCALAU12I) { + MRI->replaceRegWith(Lo12.getOperand(0).getReg(), + Hi20.getOperand(0).getReg()); + } else if (Hi20.getOpcode() == LoongArch::LU12I_W) { + MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg()); + MRI->replaceRegWith(Lo12.getOperand(0).getReg(), + Add->getOperand(0).getReg()); + } Lo12.eraseFromParent(); return true; } @@ -693,8 +813,21 @@ bool LoongArchMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) { MachineInstr *Lo20 = nullptr; MachineInstr *Hi12 = nullptr; MachineInstr *Last = nullptr; - if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last)) + if (Hi20.getOpcode() == LoongArch::PCALAU12I) { + // Detect foldable pcala code sequence in small/medium/large code model. + if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last)) + continue; + } else if (Hi20.getOpcode() == LoongArch::LU12I_W) { + MachineInstr *Add = nullptr; + // Detect foldable tls-le code sequence in small/medium code model. + if (!detectFoldable(Hi20, Add, Lo12)) + continue; + } else { continue; + } + // For tls-le, we do not pass the second PseudoAddTPRel instr in order to + // reuse the existing hooks and the last three parameters should always be + // nullptr. 
MadeChange |= detectAndFoldOffset(Hi20, *Lo12, Lo20, Hi12, Last); MadeChange |= foldIntoMemoryOps(Hi20, *Lo12, Lo20, Hi12, Last); } diff --git a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll index e0a93e3051bf8..92d079ab3a8d8 100644 --- a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll +++ b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll @@ -317,11 +317,10 @@ define void @test_la_tls_le(i32 signext %n) { ; LA32-NEXT: move $a1, $zero ; LA32-NEXT: lu12i.w $a2, %le_hi20_r(le) ; LA32-NEXT: add.w $a2, $a2, $tp, %le_add_r(le) -; LA32-NEXT: addi.w $a2, $a2, %le_lo12_r(le) ; LA32-NEXT: .p2align 4, , 16 ; LA32-NEXT: .LBB4_1: # %loop ; LA32-NEXT: # =>This Inner Loop Header: Depth=1 -; LA32-NEXT: ld.w $zero, $a2, 0 +; LA32-NEXT: ld.w $zero, $a2, %le_lo12_r(le) ; LA32-NEXT: addi.w $a1, $a1, 1 ; LA32-NEXT: blt $a1, $a0, .LBB4_1 ; LA32-NEXT: # %bb.2: # %ret @@ -332,11 +331,10 @@ define void @test_la_tls_le(i32 signext %n) { ; LA64-NEXT: move $a1, $zero ; LA64-NEXT: lu12i.w $a2, %le_hi20_r(le) ; LA64-NEXT: add.d $a2, $a2, $tp, %le_add_r(le) -; LA64-NEXT: addi.d $a2, $a2, %le_lo12_r(le) ; LA64-NEXT: .p2align 4, , 16 ; LA64-NEXT: .LBB4_1: # %loop ; LA64-NEXT: # =>This Inner Loop Header: Depth=1 -; LA64-NEXT: ld.w $zero, $a2, 0 +; LA64-NEXT: ld.w $zero, $a2, %le_lo12_r(le) ; LA64-NEXT: addi.w $a1, $a1, 1 ; LA64-NEXT: blt $a1, $a0, .LBB4_1 ; LA64-NEXT: # %bb.2: # %ret diff --git a/llvm/test/CodeGen/LoongArch/merge-base-offset-tlsle.ll b/llvm/test/CodeGen/LoongArch/merge-base-offset-tlsle.ll index 7e995d224ce1d..9ed9a865ce55d 100644 --- a/llvm/test/CodeGen/LoongArch/merge-base-offset-tlsle.ll +++ b/llvm/test/CodeGen/LoongArch/merge-base-offset-tlsle.ll @@ -11,16 +11,14 @@ define dso_local signext i8 @tlsle_load_s8() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i8) -; LA32-NEXT: addi.w $a0, $a0, 
%le_lo12_r(g_i8) -; LA32-NEXT: ld.b $a0, $a0, 0 +; LA32-NEXT: ld.b $a0, $a0, %le_lo12_r(g_i8) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_load_s8: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i8) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i8) -; LA64-NEXT: ld.b $a0, $a0, 0 +; LA64-NEXT: ld.b $a0, $a0, %le_lo12_r(g_i8) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i8) @@ -33,16 +31,14 @@ define dso_local zeroext i8 @tlsle_load_u8() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i8) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i8) -; LA32-NEXT: ld.bu $a0, $a0, 0 +; LA32-NEXT: ld.bu $a0, $a0, %le_lo12_r(g_i8) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_load_u8: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i8) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i8) -; LA64-NEXT: ld.bu $a0, $a0, 0 +; LA64-NEXT: ld.bu $a0, $a0, %le_lo12_r(g_i8) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i8) @@ -55,18 +51,16 @@ define dso_local void @tlsle_store_i8() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i8) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i8) ; LA32-NEXT: ori $a1, $zero, 1 -; LA32-NEXT: st.b $a1, $a0, 0 +; LA32-NEXT: st.b $a1, $a0, %le_lo12_r(g_i8) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_store_i8: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i8) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i8) ; LA64-NEXT: ori $a1, $zero, 1 -; LA64-NEXT: st.b $a1, $a0, 0 +; LA64-NEXT: st.b $a1, $a0, %le_lo12_r(g_i8) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i8) @@ -81,16 +75,14 @@ define dso_local signext i16 @tlsle_load_s16() nounwind { ; LA32: # 
%bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i16) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i16) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i16) -; LA32-NEXT: ld.h $a0, $a0, 0 +; LA32-NEXT: ld.h $a0, $a0, %le_lo12_r(g_i16) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_load_s16: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i16) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i16) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i16) -; LA64-NEXT: ld.h $a0, $a0, 0 +; LA64-NEXT: ld.h $a0, $a0, %le_lo12_r(g_i16) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i16) @@ -103,16 +95,14 @@ define dso_local zeroext i16 @tlsle_load_u16() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i16) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i16) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i16) -; LA32-NEXT: ld.hu $a0, $a0, 0 +; LA32-NEXT: ld.hu $a0, $a0, %le_lo12_r(g_i16) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_load_u16: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i16) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i16) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i16) -; LA64-NEXT: ld.hu $a0, $a0, 0 +; LA64-NEXT: ld.hu $a0, $a0, %le_lo12_r(g_i16) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i16) @@ -125,18 +115,16 @@ define dso_local void @tlsle_store_i16() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i16) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i16) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i16) ; LA32-NEXT: ori $a1, $zero, 1 -; LA32-NEXT: st.h $a1, $a0, 0 +; LA32-NEXT: st.h $a1, $a0, %le_lo12_r(g_i16) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_store_i16: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i16) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i16) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i16) ; LA64-NEXT: ori $a1, $zero, 1 -; LA64-NEXT: st.h $a1, $a0, 0 +; LA64-NEXT: st.h $a1, $a0, 
%le_lo12_r(g_i16) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i16) @@ -151,16 +139,14 @@ define dso_local signext i32 @tlsle_load_s32() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i32) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i32) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i32) -; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, %le_lo12_r(g_i32) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_load_s32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i32) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i32) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i32) -; LA64-NEXT: ld.w $a0, $a0, 0 +; LA64-NEXT: ld.w $a0, $a0, %le_lo12_r(g_i32) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i32) @@ -173,16 +159,14 @@ define dso_local zeroext i32 @tlsle_load_u32() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i32) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i32) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i32) -; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, %le_lo12_r(g_i32) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_load_u32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i32) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i32) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i32) -; LA64-NEXT: ld.wu $a0, $a0, 0 +; LA64-NEXT: ld.wu $a0, $a0, %le_lo12_r(g_i32) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i32) @@ -195,18 +179,16 @@ define dso_local void @tlsle_store_i32() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i32) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i32) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i32) ; LA32-NEXT: ori $a1, $zero, 1 -; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $a1, $a0, %le_lo12_r(g_i32) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_store_i32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i32) ; LA64-NEXT: add.d 
$a0, $a0, $tp, %le_add_r(g_i32) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i32) ; LA64-NEXT: ori $a1, $zero, 1 -; LA64-NEXT: st.w $a1, $a0, 0 +; LA64-NEXT: st.w $a1, $a0, %le_lo12_r(g_i32) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i32) @@ -230,8 +212,7 @@ define dso_local i64 @tlsle_load_i64() nounwind { ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i64) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i64) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i64) -; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: ld.d $a0, $a0, %le_lo12_r(g_i64) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i64) @@ -254,9 +235,8 @@ define dso_local void @tlsle_store_i64() nounwind { ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i64) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i64) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i64) ; LA64-NEXT: ori $a1, $zero, 1 -; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a1, $a0, %le_lo12_r(g_i64) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i64) @@ -271,16 +251,14 @@ define dso_local float @tlsle_load_f32() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_f32) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_f32) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_f32) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %le_lo12_r(g_f32) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_load_f32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_f32) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_f32) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_f32) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %le_lo12_r(g_f32) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_f32) @@ -293,18 +271,16 @@ define dso_local void @tlsle_store_f32() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_f32) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_f32) -; 
LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_f32) ; LA32-NEXT: lu12i.w $a1, 260096 -; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $a1, $a0, %le_lo12_r(g_f32) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_store_f32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_f32) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_f32) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_f32) ; LA64-NEXT: lu12i.w $a1, 260096 -; LA64-NEXT: st.w $a1, $a0, 0 +; LA64-NEXT: st.w $a1, $a0, %le_lo12_r(g_f32) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_f32) @@ -319,16 +295,14 @@ define dso_local double @tlsle_load_f64() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_f64) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_f64) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_f64) -; LA32-NEXT: fld.d $fa0, $a0, 0 +; LA32-NEXT: fld.d $fa0, $a0, %le_lo12_r(g_f64) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_load_f64: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_f64) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_f64) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_f64) -; LA64-NEXT: fld.d $fa0, $a0, 0 +; LA64-NEXT: fld.d $fa0, $a0, %le_lo12_r(g_f64) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_f64) @@ -341,18 +315,16 @@ define dso_local void @tlsle_store_f64() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_f64) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_f64) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_f64) ; LA32-NEXT: vldi $vr0, -912 -; LA32-NEXT: fst.d $fa0, $a0, 0 +; LA32-NEXT: fst.d $fa0, $a0, %le_lo12_r(g_f64) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_store_f64: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_f64) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_f64) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_f64) ; LA64-NEXT: lu52i.d $a1, $zero, 1023 -; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a1, $a0, %le_lo12_r(g_f64) ; LA64-NEXT: ret entry: %0 = call 
ptr @llvm.threadlocal.address.p0(ptr @g_f64) @@ -380,11 +352,10 @@ define dso_local void @tlsle_store_multi() nounwind { ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_m64) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_m64) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_m64) ; LA64-NEXT: ori $a1, $zero, 1 -; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a1, $a0, %le_lo12_r(g_m64) ; LA64-NEXT: ori $a1, $zero, 2 -; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a1, $a0, %le_lo12_r(g_m64) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_m64) @@ -400,18 +371,16 @@ define dso_local void @tlsle_store_sf32() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_sf32) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_sf32) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_sf32) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: fst.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %le_lo12_r(g_sf32) +; LA32-NEXT: fst.s $fa0, $a0, %le_lo12_r(g_sf32) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_store_sf32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_sf32) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_sf32) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_sf32) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: fst.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %le_lo12_r(g_sf32) +; LA64-NEXT: fst.s $fa0, $a0, %le_lo12_r(g_sf32) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_sf32) @@ -427,18 +396,16 @@ define dso_local void @tlsle_store_sf64() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_sf64) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_sf64) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_sf64) -; LA32-NEXT: fld.d $fa0, $a0, 0 -; LA32-NEXT: fst.d $fa0, $a0, 0 +; LA32-NEXT: fld.d $fa0, $a0, %le_lo12_r(g_sf64) +; LA32-NEXT: fst.d $fa0, $a0, %le_lo12_r(g_sf64) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_store_sf64: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, 
%le_hi20_r(g_sf64) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_sf64) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_sf64) -; LA64-NEXT: fld.d $fa0, $a0, 0 -; LA64-NEXT: fst.d $fa0, $a0, 0 +; LA64-NEXT: fld.d $fa0, $a0, %le_lo12_r(g_sf64) +; LA64-NEXT: fst.d $fa0, $a0, %le_lo12_r(g_sf64) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_sf64) @@ -455,24 +422,20 @@ define dso_local void @tlsle_copy_i32x4() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i32x4_src) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i32x4_src) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i32x4_src) -; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vld $vr0, $a0, %le_lo12_r(g_i32x4_src) ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i32x4_dst) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i32x4_dst) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i32x4_dst) -; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: vst $vr0, $a0, %le_lo12_r(g_i32x4_dst) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_copy_i32x4: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i32x4_src) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i32x4_src) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i32x4_src) -; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vld $vr0, $a0, %le_lo12_r(g_i32x4_src) ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i32x4_dst) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i32x4_dst) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i32x4_dst) -; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: vst $vr0, $a0, %le_lo12_r(g_i32x4_dst) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i32x4_src) @@ -490,24 +453,20 @@ define dso_local void @tlsle_copy_i32x8() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i32x8_src) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i32x8_src) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i32x8_src) -; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvld $xr0, $a0, %le_lo12_r(g_i32x8_src) ; LA32-NEXT: lu12i.w $a0, 
%le_hi20_r(g_i32x8_dst) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i32x8_dst) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i32x8_dst) -; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: xvst $xr0, $a0, %le_lo12_r(g_i32x8_dst) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_copy_i32x8: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i32x8_src) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i32x8_src) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i32x8_src) -; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvld $xr0, $a0, %le_lo12_r(g_i32x8_src) ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i32x8_dst) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i32x8_dst) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i32x8_dst) -; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: xvst $xr0, $a0, %le_lo12_r(g_i32x8_dst) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i32x8_src) @@ -524,24 +483,20 @@ define dso_local void @tlsle_copy_i8_to_i8x16() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i8) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i8) -; LA32-NEXT: vldrepl.b $vr0, $a0, 0 +; LA32-NEXT: vldrepl.b $vr0, $a0, %le_lo12_r(g_i8) ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i8x16) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i8x16) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i8x16) -; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: vst $vr0, $a0, %le_lo12_r(g_i8x16) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_copy_i8_to_i8x16: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i8) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i8) -; LA64-NEXT: vldrepl.b $vr0, $a0, 0 +; LA64-NEXT: vldrepl.b $vr0, $a0, %le_lo12_r(g_i8) ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i8x16) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i8x16) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i8x16) -; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: vst $vr0, $a0, %le_lo12_r(g_i8x16) ; 
LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i8) @@ -558,24 +513,20 @@ define dso_local void @tlsle_copy_i8_to_i8x32() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i8) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i8) -; LA32-NEXT: xvldrepl.b $xr0, $a0, 0 +; LA32-NEXT: xvldrepl.b $xr0, $a0, %le_lo12_r(g_i8) ; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i8x32) ; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i8x32) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i8x32) -; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: xvst $xr0, $a0, %le_lo12_r(g_i8x32) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_copy_i8_to_i8x32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i8) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i8) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i8) -; LA64-NEXT: xvldrepl.b $xr0, $a0, 0 +; LA64-NEXT: xvldrepl.b $xr0, $a0, %le_lo12_r(g_i8) ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i8x32) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i8x32) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i8x32) -; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: xvst $xr0, $a0, %le_lo12_r(g_i8x32) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i8) @@ -606,10 +557,9 @@ define dso_local void @tlsle_rmw() nounwind { ; LA64: # %bb.0: # %entry ; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_rmw) ; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_rmw) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_rmw) -; LA64-NEXT: ld.d $a1, $a0, 0 +; LA64-NEXT: ld.d $a1, $a0, %le_lo12_r(g_rmw) ; LA64-NEXT: addi.d $a1, $a1, 1 -; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a1, $a0, %le_lo12_r(g_rmw) ; LA64-NEXT: ret entry: %0 = call ptr @llvm.threadlocal.address.p0(ptr @g_rmw) @@ -624,22 +574,18 @@ entry: define dso_local void @tlsle_store_a32() nounwind { ; LA32-LABEL: tlsle_store_a32: ; LA32: # %bb.0: # %entry -; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a32) -; LA32-NEXT: add.w $a0, $a0, $tp, 
%le_add_r(g_a32) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a32) -; LA32-NEXT: lu12i.w $a1, 1 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a32+4096) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a32+4096) ; LA32-NEXT: ori $a1, $zero, 1 -; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $a1, $a0, %le_lo12_r(g_a32+4096) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_store_a32: ; LA64: # %bb.0: # %entry -; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a32) -; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a32) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a32) +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a32+4096) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a32+4096) ; LA64-NEXT: ori $a1, $zero, 1 -; LA64-NEXT: stptr.w $a1, $a0, 4096 +; LA64-NEXT: st.w $a1, $a0, %le_lo12_r(g_a32+4096) ; LA64-NEXT: ret entry: store i32 1, ptr getelementptr inbounds ([1 x i32], ptr @g_a32, i32 1024), align 4 @@ -681,29 +627,27 @@ entry: define dso_local void @tlsle_control_flow_with_mem_access() nounwind { ; LA32-LABEL: tlsle_control_flow_with_mem_access: ; LA32: # %bb.0: # %entry -; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a32) -; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a32) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a32) -; LA32-NEXT: ld.w $a1, $a0, 4 +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a32+4) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a32+4) +; LA32-NEXT: ld.w $a1, $a0, %le_lo12_r(g_a32+4) ; LA32-NEXT: ori $a2, $zero, 1 ; LA32-NEXT: blt $a1, $a2, .LBB25_2 ; LA32-NEXT: # %bb.1: # %if.then ; LA32-NEXT: ori $a1, $zero, 10 -; LA32-NEXT: st.w $a1, $a0, 4 +; LA32-NEXT: st.w $a1, $a0, %le_lo12_r(g_a32+4) ; LA32-NEXT: .LBB25_2: # %if.end ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_control_flow_with_mem_access: ; LA64: # %bb.0: # %entry -; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a32) -; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a32) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a32) -; LA64-NEXT: ld.w $a1, $a0, 4 +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a32+4) +; LA64-NEXT: add.d $a0, 
$a0, $tp, %le_add_r(g_a32+4) +; LA64-NEXT: ld.w $a1, $a0, %le_lo12_r(g_a32+4) ; LA64-NEXT: ori $a2, $zero, 1 ; LA64-NEXT: blt $a1, $a2, .LBB25_2 ; LA64-NEXT: # %bb.1: # %if.then ; LA64-NEXT: ori $a1, $zero, 10 -; LA64-NEXT: st.w $a1, $a0, 4 +; LA64-NEXT: st.w $a1, $a0, %le_lo12_r(g_a32+4) ; LA64-NEXT: .LBB25_2: # %if.end ; LA64-NEXT: ret entry: @@ -724,18 +668,16 @@ if.end: define dso_local ptr @tlsle_load_addr_offset_1() nounwind { ; LA32-LABEL: tlsle_load_addr_offset_1: ; LA32: # %bb.0: # %entry -; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) -; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) -; LA32-NEXT: addi.w $a0, $a0, 8 +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64+8) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64+8) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64+8) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_load_addr_offset_1: ; LA64: # %bb.0: # %entry -; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) -; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64) -; LA64-NEXT: addi.d $a0, $a0, 8 +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64+8) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64+8) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64+8) ; LA64-NEXT: ret entry: ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 1) @@ -744,20 +686,16 @@ entry: define dso_local ptr @tlsle_load_addr_offset_257() nounwind { ; LA32-LABEL: tlsle_load_addr_offset_257: ; LA32: # %bb.0: # %entry -; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) -; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) -; LA32-NEXT: addi.w $a0, $a0, 2047 -; LA32-NEXT: addi.w $a0, $a0, 9 +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64+2056) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64+2056) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64+2056) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_load_addr_offset_257: ; LA64: # %bb.0: # %entry -; LA64-NEXT: lu12i.w $a0, 
%le_hi20_r(g_a64) -; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64) -; LA64-NEXT: addi.d $a0, $a0, 2047 -; LA64-NEXT: addi.d $a0, $a0, 9 +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64+2056) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64+2056) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64+2056) ; LA64-NEXT: ret entry: ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 257) @@ -766,19 +704,16 @@ entry: define dso_local ptr @tlsle_load_addr_offset_1048576() nounwind { ; LA32-LABEL: tlsle_load_addr_offset_1048576: ; LA32: # %bb.0: # %entry -; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) -; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) -; LA32-NEXT: lu12i.w $a1, 2048 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64+8388608) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64+8388608) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64+8388608) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_load_addr_offset_1048576: ; LA64: # %bb.0: # %entry -; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) -; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64) -; LA64-NEXT: addu16i.d $a0, $a0, 128 +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64+8388608) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64+8388608) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64+8388608) ; LA64-NEXT: ret entry: ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 1048576) @@ -787,21 +722,16 @@ entry: define dso_local ptr @tlsle_load_addr_offset_1048577() nounwind { ; LA32-LABEL: tlsle_load_addr_offset_1048577: ; LA32: # %bb.0: # %entry -; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) -; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) -; LA32-NEXT: lu12i.w $a1, 2048 -; LA32-NEXT: ori $a1, $a1, 8 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64+8388616) +; LA32-NEXT: 
add.w $a0, $a0, $tp, %le_add_r(g_a64+8388616) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64+8388616) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_load_addr_offset_1048577: ; LA64: # %bb.0: # %entry -; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) -; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64) -; LA64-NEXT: addu16i.d $a0, $a0, 128 -; LA64-NEXT: addi.d $a0, $a0, 8 +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64+8388616) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64+8388616) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64+8388616) ; LA64-NEXT: ret entry: ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 1048577) @@ -810,20 +740,16 @@ entry: define dso_local ptr @tlsle_load_addr_offset_268432896() nounwind { ; LA32-LABEL: tlsle_load_addr_offset_268432896: ; LA32: # %bb.0: # %entry -; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) -; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) -; LA32-NEXT: lu12i.w $a1, 524283 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64+2147463168) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64+2147463168) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64+2147463168) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_load_addr_offset_268432896: ; LA64: # %bb.0: # %entry -; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) -; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64) -; LA64-NEXT: lu12i.w $a1, 524283 -; LA64-NEXT: add.d $a0, $a0, $a1 +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64+2147463168) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64+2147463168) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64+2147463168) ; LA64-NEXT: ret entry: ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 268432896) @@ -832,22 +758,16 @@ entry: define dso_local ptr @tlsle_load_addr_offset_268432897() nounwind { ; LA32-LABEL: tlsle_load_addr_offset_268432897: ; LA32: # %bb.0: # %entry -; 
LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) -; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) -; LA32-NEXT: lu12i.w $a1, 524283 -; LA32-NEXT: ori $a1, $a1, 8 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64+2147463176) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64+2147463176) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64+2147463176) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_load_addr_offset_268432897: ; LA64: # %bb.0: # %entry -; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) -; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64) -; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64) -; LA64-NEXT: lu12i.w $a1, 524283 -; LA64-NEXT: ori $a1, $a1, 8 -; LA64-NEXT: add.d $a0, $a0, $a1 +; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_a64+2147463176) +; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_a64+2147463176) +; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_a64+2147463176) ; LA64-NEXT: ret entry: ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 268432897) @@ -877,11 +797,9 @@ entry: define dso_local ptr @tlsle_load_addr_offset_248792680471040() nounwind { ; LA32-LABEL: tlsle_load_addr_offset_248792680471040: ; LA32: # %bb.0: # %entry -; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) -; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) -; LA32-NEXT: lu12i.w $a1, 502733 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64+2059194368) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64+2059194368) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64+2059194368) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_load_addr_offset_248792680471040: @@ -900,12 +818,9 @@ entry: define dso_local ptr @tlsle_load_addr_offset_9380351707272() nounwind { ; LA32-LABEL: tlsle_load_addr_offset_9380351707272: ; LA32: # %bb.0: # %entry -; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) -; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) -; LA32-NEXT: addi.w $a0, $a0, 
%le_lo12_r(g_a64) -; LA32-NEXT: lu12i.w $a1, 279556 -; LA32-NEXT: ori $a1, $a1, 1088 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64+1145062464) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64+1145062464) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64+1145062464) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_load_addr_offset_9380351707272: @@ -945,12 +860,9 @@ entry: define dso_local ptr @tlsle_load_addr_offset_614749556925924693() nounwind { ; LA32-LABEL: tlsle_load_addr_offset_614749556925924693: ; LA32: # %bb.0: # %entry -; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64) -; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64) -; LA32-NEXT: lu12i.w $a1, 209666 -; LA32-NEXT: ori $a1, $a1, 2728 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_a64+858794664) +; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_a64+858794664) +; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_a64+858794664) ; LA32-NEXT: ret ; ; LA64-LABEL: tlsle_load_addr_offset_614749556925924693: From 71ee257a1d3a3e09423132e36f526e032c0f3b93 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 10 Feb 2025 19:35:28 +0800 Subject: [PATCH 141/293] [RISCV][VLOPT] Precommit tests for opt info on passthrus. NFC Currently we are returning the wrong operand info for passthru operands. --- .../test/CodeGen/RISCV/rvv/vl-opt-op-info.mir | 216 ++++++++++++++++++ 1 file changed, 216 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir index 8ae48e0b27e1e..c6e8dd92f8458 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir @@ -93,6 +93,42 @@ body: | %y:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 4 /* e8 */, 0 ... 
--- +name: vwop_vv_vd_passthru_use +body: | + bb.0: + ; CHECK-LABEL: name: vwop_vv_vd_passthru_use + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVWADD_VV_MF2 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 + %y:vr = PseudoVWADD_VV_MF2 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 + %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 +... +--- +name: vwop_vv_vd_passthru_use_incompatible_eew +body: | + bb.0: + ; CHECK-LABEL: name: vwop_vv_vd_passthru_use_incompatible_eew + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVWADD_VV_MF2 %x, $noreg, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 + %y:vr = PseudoVWADD_VV_MF2 %x, $noreg, $noreg, 1, 4 /* e16 */, 0 + %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 +... +--- +name: vwop_vv_vd_passthru_use_incompatible_emul +body: | + bb.0: + ; CHECK-LABEL: name: vwop_vv_vd_passthru_use_incompatible_emul + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVWADD_VV_MF4 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: %z:vr = PseudoVADD_VV_MF2 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 + %y:vr = PseudoVWADD_VV_MF4 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 + %z:vr = PseudoVADD_VV_MF2 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 +... +--- name: vwop_vv_vs2 body: | bb.0: @@ -183,6 +219,42 @@ body: | %y:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 4 /* e8 */, 0 ... 
--- +name: vwop_wv_vd_passthru_use +body: | + bb.0: + ; CHECK-LABEL: name: vwop_wv_vd_passthru_use + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVWADD_WV_MF2 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 + %y:vr = PseudoVWADD_WV_MF2 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 + %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 +... +--- +name: vwop_wv_vd_passthru_use_incompatible_eew +body: | + bb.0: + ; CHECK-LABEL: name: vwop_wv_vd_passthru_use_incompatible_eew + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVWADD_WV_MF2 %x, $noreg, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 + %y:vr = PseudoVWADD_WV_MF2 %x, $noreg, $noreg, 1, 4 /* e16 */, 0 + %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 +... +--- +name: vwop_wv_vd_passthru_use_incompatible_emul +body: | + bb.0: + ; CHECK-LABEL: name: vwop_wv_vd_passthru_use_incompatible_emul + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVWADD_WV_MF4 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: %z:vr = PseudoVADD_VV_MF2 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 + %y:vr = PseudoVWADD_WV_MF4 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 + %z:vr = PseudoVADD_VV_MF2 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 +... +--- name: vwop_wv_vs2 body: | bb.0: @@ -483,6 +555,42 @@ body: | %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 ... 
--- +name: vnop_wv_vd_passthru_use +body: | + bb.0: + ; CHECK-LABEL: name: vnop_wv_vd_passthru_use + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVNSRL_WV_M1 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 + %y:vr = PseudoVNSRL_WV_M1 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 + %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 3 /* e8 */, 0 +... +--- +name: vnop_wv_vd_passthru_use_incompatible_eew +body: | + bb.0: + ; CHECK-LABEL: name: vnop_wv_vd_passthru_use_incompatible_eew + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVNSRL_WV_M1 %x, $noreg, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 + %y:vr = PseudoVNSRL_WV_M1 %x, $noreg, $noreg, 1, 4 /* e16 */, 0 + %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 +... +--- +name: vnop_wv_vd_passthru_use_unsupported_emul +body: | + bb.0: + ; CHECK-LABEL: name: vnop_wv_vd_passthru_use_unsupported_emul + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vr = PseudoVNSRL_WV_MF2 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: %z:vr = PseudoVADD_VV_MF2 $noreg, %y, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 + %y:vr = PseudoVNSRL_WV_MF2 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 + %z:vr = PseudoVADD_VV_MF2 $noreg, %y, $noreg, 1, 3 /* e8 */, 0 +... +--- name: vnop_wv_vs2 body: | bb.0: @@ -1003,6 +1111,42 @@ body: | %y:vr = PseudoVMSEQ_VV_MF2 $noreg, %x, 1, 3 /* e8 */ ... 
--- +name: vmop_vv_passthru_use +body: | + bb.0: + ; CHECK-LABEL: name: vmop_vv_passthru_use + ; CHECK: %x:vrnov0 = PseudoVMAND_MM_B8 $noreg, $noreg, -1, 0 /* e8 */ + ; CHECK-NEXT: %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */ + ; CHECK-NEXT: %z:vr = PseudoVMAND_MM_B8 %y, $noreg, 1, 0 /* e8 */ + %x:vrnov0 = PseudoVMAND_MM_B8 $noreg, $noreg, -1, 0 /* e1 */ + %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */ + %z:vr = PseudoVMAND_MM_B8 %y, $noreg, 1, 0 /* e1 */ +... +--- +name: vmop_vv_passthru_use_incompatible_eew +body: | + bb.0: + ; CHECK-LABEL: name: vmop_vv_passthru_use_incompatible_eew + ; CHECK: %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */ + ; CHECK-NEXT: %z:vr = PseudoVMAND_MM_B8 %y, $noreg, 1, 0 /* e8 */ + %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 + %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */ + %z:vr = PseudoVMAND_MM_B8 %y, $noreg, 1, 0 /* e1 */ +... +--- +name: vmop_vv_passthru_use_incompatible_emul +body: | + bb.0: + ; CHECK-LABEL: name: vmop_vv_passthru_use_incompatible_emul + ; CHECK: %x:vrnov0 = PseudoVMAND_MM_B16 $noreg, $noreg, -1, 0 /* e8 */ + ; CHECK-NEXT: %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */ + ; CHECK-NEXT: %z:vr = PseudoVMAND_MM_B8 %y, $noreg, 1, 0 /* e8 */ + %x:vrnov0 = PseudoVMAND_MM_B16 $noreg, $noreg, -1, 0 /* e1 */ + %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */ + %z:vr = PseudoVMAND_MM_B8 %y, $noreg, 1, 0 /* e1 */ +... +--- name: vmerge_vim body: | bb.0: @@ -1213,6 +1357,42 @@ body: | %y:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 ... 
--- +name: viota_m_dest_passthru_use +body: | + bb.0: + ; CHECK-LABEL: name: viota_m_dest_passthru_use + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVIOTA_M_M1 %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 + %y:vr = PseudoVIOTA_M_M1 %x, $noreg, 1, 3 /* e8 */, 0 + %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 3 /* e8 */, 0 +... +--- +name: viota_m_dest_passthru_use_incompatible_eew +body: | + bb.0: + ; CHECK-LABEL: name: viota_m_dest_passthru_use_incompatible_eew + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVIOTA_M_M1 %x, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 + %y:vr = PseudoVIOTA_M_M1 %x, $noreg, 1, 4 /* e16 */, 0 + %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 +... +--- +name: viota_m_dest_passthru_use_incompatible_emul +body: | + bb.0: + ; CHECK-LABEL: name: viota_m_dest_passthru_use_incompatible_emul + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVIOTA_M_MF2 %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: %z:vr = PseudoVADD_VV_MF2 $noreg, %y, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 + %y:vr = PseudoVIOTA_M_MF2 %x, $noreg, 1, 3 /* e8 */, 0 + %z:vr = PseudoVADD_VV_MF2 $noreg, %y, $noreg, 1, 3 /* e8 */, 0 +... +--- name: viota_m_mask body: | bb.0: @@ -1467,6 +1647,42 @@ body: | %y:vr = PseudoVFREDMAX_VS_MF2_E32 $noreg, %x, %x, 1, 5 /* e32 */, 0 ... 
--- +name: vwred_passthru_use +body: | + bb.0: + ; CHECK-LABEL: name: vwred_passthru_use + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vr = PseudoVWREDSUM_VS_MF2_E8 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 + %y:vr = PseudoVWREDSUM_VS_MF2_E8 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 + %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 +... +--- +name: vwred_passthru_use_incompatible_eew +body: | + bb.0: + ; CHECK-LABEL: name: vwred_passthru_use_incompatible_eew + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vr = PseudoVWREDSUM_VS_MF2_E8 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 + %y:vr = PseudoVWREDSUM_VS_MF2_E8 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 + %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 +... +--- +name: vwred_passthru_use_incompatible_emul +body: | + bb.0: + ; CHECK-LABEL: name: vwred_passthru_use_incompatible_emul + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vr = PseudoVWREDSUM_VS_MF4_E8 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK-NEXT: %z:vr = PseudoVADD_VV_MF2 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 + %y:vr = PseudoVWREDSUM_VS_MF4_E8 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 + %z:vr = PseudoVADD_VV_MF2 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 +... 
+--- name: vfirst_v body: | bb.0: From 771f6b9f43039a4701a3ab76ac2456857ddf74ac Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 10 Feb 2025 19:55:22 +0800 Subject: [PATCH 142/293] [RISCV][VLOPT] Add support for Widening Floating-Point Fused Multiply-Add Instructions (#126485) We already had getOperandInfo support, so this marks the instructions as supported in isCandidate. It also adds support for vfwmaccbf16.v{v,f} from zvfbfwma --- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 13 ++ llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll | 218 ++++++++++++++++++- 2 files changed, 227 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index d4829bced2470..6c19a8fd32d42 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -545,6 +545,8 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { case RISCV::VFWMSAC_VV: case RISCV::VFWNMSAC_VF: case RISCV::VFWNMSAC_VV: + case RISCV::VFWMACCBF16_VV: + case RISCV::VFWMACCBF16_VF: // Vector Widening Floating-Point Add/Subtract Instructions // Dest EEW=2*SEW. Source EEW=SEW. 
case RISCV::VFWADD_VV: @@ -1050,6 +1052,17 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VFMSUB_VF: case RISCV::VFNMSUB_VV: case RISCV::VFNMSUB_VF: + // Vector Widening Floating-Point Fused Multiply-Add Instructions + case RISCV::VFWMACC_VV: + case RISCV::VFWMACC_VF: + case RISCV::VFWNMACC_VV: + case RISCV::VFWNMACC_VF: + case RISCV::VFWMSAC_VV: + case RISCV::VFWMSAC_VF: + case RISCV::VFWNMSAC_VV: + case RISCV::VFWNMSAC_VF: + case RISCV::VFWMACCBF16_VV: + case RISCV::VFWMACCBF16_VF: // Vector Floating-Point MIN/MAX Instructions case RISCV::VFMIN_VF: case RISCV::VFMIN_VV: diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll index 053f1209cf214..f4591a191c8b7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvbb -riscv-enable-vl-optimizer=false -verify-machineinstrs | FileCheck %s --check-prefixes=NOVLOPT -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvbb -riscv-enable-vl-optimizer=false -verify-machineinstrs | FileCheck %s --check-prefixes=NOVLOPT -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvbb -riscv-enable-vl-optimizer -verify-machineinstrs | FileCheck %s --check-prefixes=VLOPT -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvbb -riscv-enable-vl-optimizer -verify-machineinstrs | FileCheck %s --check-prefixes=VLOPT +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvbb,+zvfbfwma -riscv-enable-vl-optimizer=false -verify-machineinstrs | FileCheck %s --check-prefixes=NOVLOPT +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvbb,+zvfbfwma -riscv-enable-vl-optimizer=false -verify-machineinstrs | FileCheck %s --check-prefixes=NOVLOPT +; RUN: sed 's/iXLen/i32/g' %s | llc 
-mtriple=riscv32 -mattr=+v,+zvbb,+zvfbfwma -riscv-enable-vl-optimizer -verify-machineinstrs | FileCheck %s --check-prefixes=VLOPT +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvbb,+zvfbfwma -riscv-enable-vl-optimizer -verify-machineinstrs | FileCheck %s --check-prefixes=VLOPT ; The purpose of this file is to check the behavior of specific instructions as it relates to the VL optimizer @@ -4351,3 +4351,213 @@ define @vfnmsub_vf( %a, float %b, @llvm.riscv.vfadd( poison, %1, %c, iXLen 7, iXLen %vl) ret %2 } + +define @vfwmacc_vv( %a, %b, %c, %d, iXLen %vl) { +; NOVLOPT-LABEL: vfwmacc_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma +; NOVLOPT-NEXT: vfwmacc.vv v8, v12, v14 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vfadd.vv v8, v8, v16 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vfwmacc_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma +; VLOPT-NEXT: vfwmacc.vv v8, v12, v14 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vfadd.vv v8, v8, v16 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vfwmacc( %a, %b, %c, iXLen 7, iXLen -1, iXLen 0) + %2 = call @llvm.riscv.vfadd( poison, %1, %d, iXLen 7, iXLen %vl) + ret %2 +} + +define @vfwmacc_vf( %a, float %b, %c, %d, iXLen %vl) { +; NOVLOPT-LABEL: vfwmacc_vf: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma +; NOVLOPT-NEXT: vfwmacc.vf v8, fa0, v12 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vfadd.vv v8, v8, v16 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vfwmacc_vf: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma +; VLOPT-NEXT: vfwmacc.vf v8, fa0, v12 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vfadd.vv v8, v8, v16 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vfwmacc( %a, float %b, %c, iXLen 7, iXLen -1, iXLen 0) + %2 = call @llvm.riscv.vfadd( poison, %1, %d, iXLen 7, iXLen %vl) + ret %2 +} + +define @vfwnmacc_vv( %a, %b, %c, %d, iXLen %vl) { +; 
NOVLOPT-LABEL: vfwnmacc_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma +; NOVLOPT-NEXT: vfwnmacc.vv v8, v12, v14 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vfadd.vv v8, v8, v16 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vfwnmacc_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma +; VLOPT-NEXT: vfwnmacc.vv v8, v12, v14 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vfadd.vv v8, v8, v16 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vfwnmacc( %a, %b, %c, iXLen 7, iXLen -1, iXLen 0) + %2 = call @llvm.riscv.vfadd( poison, %1, %d, iXLen 7, iXLen %vl) + ret %2 +} + +define @vfwnmacc_vf( %a, float %b, %c, %d, iXLen %vl) { +; NOVLOPT-LABEL: vfwnmacc_vf: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma +; NOVLOPT-NEXT: vfwnmacc.vf v8, fa0, v12 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vfadd.vv v8, v8, v16 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vfwnmacc_vf: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma +; VLOPT-NEXT: vfwnmacc.vf v8, fa0, v12 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vfadd.vv v8, v8, v16 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vfwnmacc( %a, float %b, %c, iXLen 7, iXLen -1, iXLen 0) + %2 = call @llvm.riscv.vfadd( poison, %1, %d, iXLen 7, iXLen %vl) + ret %2 +} + +define @vfwmsac_vv( %a, %b, %c, %d, iXLen %vl) { +; NOVLOPT-LABEL: vfwmsac_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma +; NOVLOPT-NEXT: vfwmsac.vv v8, v12, v14 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vfadd.vv v8, v8, v16 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vfwmsac_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma +; VLOPT-NEXT: vfwmsac.vv v8, v12, v14 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vfadd.vv v8, v8, v16 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vfwmsac( %a, %b, %c, iXLen 7, iXLen -1, iXLen 0) + %2 = 
call @llvm.riscv.vfadd( poison, %1, %d, iXLen 7, iXLen %vl) + ret %2 +} + +define @vfwmsac_vf( %a, float %b, %c, %d, iXLen %vl) { +; NOVLOPT-LABEL: vfwmsac_vf: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma +; NOVLOPT-NEXT: vfwmsac.vf v8, fa0, v12 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vfadd.vv v8, v8, v16 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vfwmsac_vf: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma +; VLOPT-NEXT: vfwmsac.vf v8, fa0, v12 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vfadd.vv v8, v8, v16 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vfwmsac( %a, float %b, %c, iXLen 7, iXLen -1, iXLen 0) + %2 = call @llvm.riscv.vfadd( poison, %1, %d, iXLen 7, iXLen %vl) + ret %2 +} + +define @vfwnmsac_vv( %a, %b, %c, %d, iXLen %vl) { +; NOVLOPT-LABEL: vfwnmsac_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma +; NOVLOPT-NEXT: vfwnmsac.vv v8, v12, v14 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vfadd.vv v8, v8, v16 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vfwnmsac_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma +; VLOPT-NEXT: vfwnmsac.vv v8, v12, v14 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vfadd.vv v8, v8, v16 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vfwnmsac( %a, %b, %c, iXLen 7, iXLen -1, iXLen 0) + %2 = call @llvm.riscv.vfadd( poison, %1, %d, iXLen 7, iXLen %vl) + ret %2 +} + +define @vfwnmsac_vf( %a, float %b, %c, %d, iXLen %vl) { +; NOVLOPT-LABEL: vfwnmsac_vf: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, tu, ma +; NOVLOPT-NEXT: vfwnmsac.vf v8, fa0, v12 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vfadd.vv v8, v8, v16 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vfwnmsac_vf: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma +; VLOPT-NEXT: vfwnmsac.vf v8, fa0, v12 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; 
VLOPT-NEXT: vfadd.vv v8, v8, v16 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vfwnmsac( %a, float %b, %c, iXLen 7, iXLen -1, iXLen 0) + %2 = call @llvm.riscv.vfadd( poison, %1, %d, iXLen 7, iXLen %vl) + ret %2 +} + +define @vfwmaccbf16_vv( %a, %b, %c, %d, iXLen %vl) { +; NOVLOPT-LABEL: vfwmaccbf16_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; NOVLOPT-NEXT: vfwmaccbf16.vv v8, v10, v11 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vfadd.vv v8, v8, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vfwmaccbf16_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma +; VLOPT-NEXT: vfwmaccbf16.vv v8, v10, v11 +; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; VLOPT-NEXT: vfadd.vv v8, v8, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vfwmaccbf16( %a, %b, %c, iXLen 7, iXLen -1, iXLen 0) + %2 = call @llvm.riscv.vfadd( poison, %1, %d, iXLen 7, iXLen %vl) + ret %2 +} + +define @vfwmaccbf16_vf( %a, bfloat %b, %c, %d, iXLen %vl) { +; NOVLOPT-LABEL: vfwmaccbf16_vf: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma +; NOVLOPT-NEXT: vfwmaccbf16.vf v8, fa0, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vfadd.vv v8, v8, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vfwmaccbf16_vf: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma +; VLOPT-NEXT: vfwmaccbf16.vf v8, fa0, v10 +; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; VLOPT-NEXT: vfadd.vv v8, v8, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vfwmaccbf16( %a, bfloat %b, %c, iXLen 7, iXLen -1, iXLen 0) + %2 = call @llvm.riscv.vfadd( poison, %1, %d, iXLen 7, iXLen %vl) + ret %2 +} From f796bc622a7725708b8ffbe0c7a684a8557e77a3 Mon Sep 17 00:00:00 2001 From: Rolf Morel Date: Mon, 10 Feb 2025 13:05:13 +0100 Subject: [PATCH 143/293] [MLIR][Linalg] Expose linalg.matmul and linalg.contract via Python API (#126377) Now that linalg.matmul is in tablegen, "hand write" the Python wrapper that OpDSL used to 
derive. Similarly, add a Python wrapper for the new linalg.contract op. Required following misc. fixes: 1) make linalg.matmul's parsing and printing consistent w.r.t. whether indexing_maps occurs before or after operands, i.e. per the tests cases it comes _before_. 2) tablegen for linalg.contract did not state it accepted an optional cast attr. 3) In ODS's C++-generating code, expand partial support for `$_builder` access in `Attr::defaultValue` to full support. This enables access to the current `MlirContext` when constructing the default value (as is required when the default value consists of affine maps). --- .../Dialect/Linalg/IR/LinalgStructuredOps.td | 8 +- mlir/include/mlir/IR/CommonAttrConstraints.td | 3 + mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 10 +- mlir/python/mlir/dialects/linalg/__init__.py | 46 ++++ mlir/test/Dialect/Linalg/named-ops.mlir | 16 +- mlir/test/python/dialects/linalg/ops.py | 210 ++++++++++++++++++ mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp | 31 ++- mlir/tools/mlir-tblgen/OpFormatGen.cpp | 22 +- mlir/tools/mlir-tblgen/RewriterGen.cpp | 2 +- 9 files changed, 316 insertions(+), 32 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index 110ed7d2fc00e..29cb8035b583b 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -606,7 +606,10 @@ def MatmulOp : LinalgStructuredBase_Op<"matmul", [ let arguments = (ins Variadic:$inputs, Variadic:$outputs, - DefaultValuedOptionalAttr:$indexing_maps, + DefaultValuedOptionalAttr< + AffineMapArrayAttr, + "MatmulOp::getDefaultIndexingMaps($_builder.getContext())" + >:$indexing_maps, DefaultValuedOptionalAttr:$cast ); let results = (outs Variadic:$result_tensors); @@ -752,7 +755,8 @@ def ContractOp : LinalgStructuredBase_Op<"contract", [ let arguments = (ins Variadic:$inputs, Variadic:$outputs, - 
AffineMapArrayAttr:$indexing_maps + AffineMapArrayAttr:$indexing_maps, + DefaultValuedOptionalAttr:$cast ); let results = (outs Variadic:$result_tensors); // NB: The only reason this op has a region - and it get populated at op build diff --git a/mlir/include/mlir/IR/CommonAttrConstraints.td b/mlir/include/mlir/IR/CommonAttrConstraints.td index 599f5ecba5803..2beb1e8110afe 100644 --- a/mlir/include/mlir/IR/CommonAttrConstraints.td +++ b/mlir/include/mlir/IR/CommonAttrConstraints.td @@ -50,6 +50,9 @@ class Attr : // Default value for attribute. // Requires a constBuilderCall defined. + // + // Format: `$_builder` will be expanded to the relevant builder, e.g. to allow + // access to the current context. string defaultValue = ?; // The value type of this attribute. This corresponds to the mlir::Type that diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index b50931f15826c..d40cec02df633 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -3666,11 +3666,6 @@ ParseResult MatmulOp::parse(OpAsmParser &parser, OperationState &result) { } void MatmulOp::print(OpAsmPrinter &p) { - SmallVector elidedAttrs = { - "operandSegmentSizes", "linalg.memoized_indexing_maps", "indexing_maps"}; - printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(), - elidedAttrs); - SmallVector indexingMaps = llvm::map_to_vector( MatmulOp::getDefaultIndexingMaps(getContext()), [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); }); @@ -3680,6 +3675,11 @@ void MatmulOp::print(OpAsmPrinter &p) { [&](Attribute attr) { p.printAttribute(attr); }); p << "]"; } + + SmallVector elidedAttrs = { + "operandSegmentSizes", "linalg.memoized_indexing_maps", "indexing_maps"}; + printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(), + elidedAttrs); } /// Verify the user defined indexing maps. 
diff --git a/mlir/python/mlir/dialects/linalg/__init__.py b/mlir/python/mlir/dialects/linalg/__init__.py index 742262a9c4969..5cda4769d593f 100644 --- a/mlir/python/mlir/dialects/linalg/__init__.py +++ b/mlir/python/mlir/dialects/linalg/__init__.py @@ -147,3 +147,49 @@ def __init__( generic = region_op(GenericOp_, terminator=YieldOp) + + +def matmul( + *ins: Union[Operation, OpView, Value], + outs: Sequence[Union[Operation, OpView, Value]], + indexing_maps: Optional[Sequence[AffineMapAttr]] = None, + cast: Optional[Union[TypeFn, Attribute]] = None, +): + ins = [_get_op_result_or_value(input) for input in ins] + if len(outs) > 1: + raise ValueError(f"{outs=} must have length 1.") + init = _get_op_result_or_value(outs[0]) + result_types = [init.type] if isinstance(init.type, RankedTensorType) else [] + + op = MatmulOp( + result_tensors=result_types, + inputs=ins, + outputs=[init], + indexing_maps=indexing_maps, + cast=cast, + ) + fill_builtin_region(op.operation) + return op + + +def contract( + *ins: Union[Operation, OpView, Value], + outs: Sequence[Union[Operation, OpView, Value]], + indexing_maps: Sequence[AffineMapAttr], + cast: Optional[Union[TypeFn, Attribute]] = None, +): + ins = [_get_op_result_or_value(input) for input in ins] + if len(outs) > 1: + raise ValueError(f"{outs=} must have length 1.") + init = _get_op_result_or_value(outs[0]) + result_types = [init.type] if isinstance(init.type, RankedTensorType) else [] + + op = ContractOp( + result_tensors=result_types, + inputs=ins, + outputs=[init], + indexing_maps=indexing_maps, + cast=cast, + ) + fill_builtin_region(op.operation) + return op diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir index ed8683522c74a..68ea97be911a6 100644 --- a/mlir/test/Dialect/Linalg/named-ops.mlir +++ b/mlir/test/Dialect/Linalg/named-ops.mlir @@ -1269,7 +1269,7 @@ func.func @matmul_transpose_b_explicit(%arg0: memref<3x5xf32>, %arg1: memref<7x5 // CHECK-SAME: %[[VAL_0:.*]]: 
memref<3x5xf32>, // CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, // CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: linalg.matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) // CHECK: return // CHECK: } @@ -1294,7 +1294,7 @@ func.func @matmul_transpose_a_b_explicit(%arg0: memref<5x3xf32>, %arg1: memref<7 // CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, // CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, // CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5x3xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: linalg.matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<5x3xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) // CHECK: return // CHECK: } @@ -1315,6 +1315,7 @@ func.func @matmul_bcast_a(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: m // CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> // CHECK-LABEL: func @matmul_bcast_a // CHECK: linalg.matmul +// CHECK-SAME: indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<5xf32>, memref<5x7xf32>) // CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) @@ -1335,6 +1336,7 @@ func.func @matmul_bcast_a_dim1(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %ar // CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> // CHECK-LABEL: func @matmul_bcast_a_dim1 // CHECK: linalg.matmul +// CHECK-SAME: indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<5xf32>, memref<5x7xf32>) // CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) @@ 
-1355,6 +1357,7 @@ func.func @matmul_bcast_b(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: m // CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> // CHECK-LABEL: func @matmul_bcast_b // CHECK: linalg.matmul +// CHECK-SAME: indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<5xf32>) // CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) @@ -1376,7 +1379,7 @@ func.func @matmul_bcast_a_b(%arg0: memref<5xf32>, %arg1: memref<5xf32>, %arg2: m // CHECK-LABEL: func.func @matmul_bcast_a_b( // CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, %[[VAL_1:.*]]: memref<5xf32>, // CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_0]], #[[$ATTR_1]]] +// CHECK: linalg.matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_0]], #[[$ATTR_1]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) // CHECK: return // CHECK: } @@ -1397,6 +1400,7 @@ func.func @matmul_bcast_b_dim1(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %ar // CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> // CHECK-LABEL: func @matmul_bcast_b_dim1 // CHECK: linalg.matmul +// CHECK-SAME: indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<5xf32>) // CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) @@ -1420,7 +1424,7 @@ func.func @dynamic_matmul_bcast_a(%arg0: memref, %arg1: memref, // CHECK-SAME: %[[VAL_0:.*]]: memref, // CHECK-SAME: %[[VAL_1:.*]]: memref, // CHECK-SAME: %[[VAL_2:.*]]: memref) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref, memref) outs(%[[VAL_2]] : memref) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: linalg.matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref, memref) 
outs(%[[VAL_2]] : memref) // CHECK: return // CHECK: } @@ -1444,7 +1448,7 @@ func.func @matmul_bcast_a_transpose_b(%arg0: memref<5xf32>, %arg1: memref<7x5xf3 // CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, // CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, // CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: linalg.matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) // CHECK: return // CHECK: } @@ -1468,7 +1472,7 @@ func.func @matmul_bcast_b_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5xf3 // CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, // CHECK-SAME: %[[VAL_1:.*]]: memref<5xf32>, // CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5x3xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] +// CHECK: linalg.matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<5x3xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) // CHECK: return // CHECK: } diff --git a/mlir/test/python/dialects/linalg/ops.py b/mlir/test/python/dialects/linalg/ops.py index ac7186c24bed8..94f8ea4faf4a8 100644 --- a/mlir/test/python/dialects/linalg/ops.py +++ b/mlir/test/python/dialects/linalg/ops.py @@ -256,3 +256,213 @@ def f(a, b): module.operation.verify() print(module) + + +# CHECK-LABEL: TEST: testMatmulOp +@run +def testMatmulOp(): + with Context(), Location.unknown(): + module = Module.create() + f32 = F32Type.get() + with InsertionPoint(module.body): + a_shape = (4, 8) + b_shape = (8, 12) + b_transposed_shape = (12, 8) + c_shape = (4, 12) + + dimM = ir.AffineDimExpr.get(0) + dimN = ir.AffineDimExpr.get(1) + dimK = 
ir.AffineDimExpr.get(2) + + # CHECK: #[[$A_MAP:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)> + # CHECK: #[[$BTrans_MAP:.*]] = affine_map<(d0, d1, d2) -> (d1, d2)> + # CHECK: #[[$C_MAP:.*]] = affine_map<(d0, d1, d2) -> (d0, d1)> + a_map = ir.AffineMap.get(3, 0, [dimM, dimK]) + b_map = ir.AffineMap.get(3, 0, [dimK, dimN]) + c_map = ir.AffineMap.get(3, 0, [dimM, dimN]) + b_transposed_map = ir.AffineMap.get(3, 0, [dimN, dimK]) + + # CHECK: func.func @matmul_op( + @func.FuncOp.from_py_func( + # CHECK-SAME: %[[A:.*]]: tensor<4x8xf32>, + RankedTensorType.get(a_shape, f32), + # CHECK-SAME: %[[Amem:.*]]: memref<4x8xf32>, + MemRefType.get(a_shape, f32), + # CHECK-SAME: %[[B:.*]]: tensor<8x12xf32>, + RankedTensorType.get(b_shape, f32), + # CHECK-SAME: %[[Bmem:.*]]: memref<8x12xf32>, + MemRefType.get(b_shape, f32), + # CHECK-SAME: %[[BTrans:.*]]: tensor<12x8xf32>, + RankedTensorType.get(b_transposed_shape, f32), + # CHECK-SAME: %[[BTransmem:.*]]: memref<12x8xf32>, + MemRefType.get(b_transposed_shape, f32), + # CHECK-SAME: %[[C:.*]]: tensor<4x12xf32>, + RankedTensorType.get(c_shape, f32), + # CHECK-SAME: %[[Cmem:.*]]: memref<4x12xf32>) + MemRefType.get(c_shape, f32), + ) + def matmul_op(A, Amem, B, Bmem, Btransposed, Btransposedmem, C, Cmem): + # CHECK: linalg.matmul ins(%[[A]], %[[B]] : tensor<4x8xf32>, tensor<8x12xf32>) outs(%[[C]] : tensor<4x12xf32>) + res = linalg.MatmulOp( + result_tensors=(C.type,), + inputs=(A, B), + outputs=(C,), + ) + linalg.fill_builtin_region(res.operation) + # CHECK: linalg.matmul ins(%[[A]], %[[B]] : tensor<4x8xf32>, tensor<8x12xf32>) outs(%[[C]] : tensor<4x12xf32>) + res = linalg.matmul(A, B, outs=(C,)) + + # CHECK: linalg.matmul indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[A]], %[[BTrans]] : tensor<4x8xf32>, tensor<12x8xf32>) outs(%[[C]] : tensor<4x12xf32>) + res = linalg.MatmulOp( + result_tensors=(C.type,), + inputs=(A, Btransposed), + outputs=(C,), + indexing_maps=[a_map, b_transposed_map, c_map], + ) + 
linalg.fill_builtin_region(res.operation) + # CHECK: linalg.matmul indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[A]], %[[BTrans]] : tensor<4x8xf32>, tensor<12x8xf32>) outs(%[[C]] : tensor<4x12xf32>) + res = linalg.matmul( + A, + Btransposed, + outs=(C,), + indexing_maps=[a_map, b_transposed_map, c_map], + ) + + # And now with memrefs... + + # CHECK: linalg.matmul ins(%[[Amem]], %[[Bmem]] : memref<4x8xf32>, memref<8x12xf32>) outs(%[[Cmem]] : memref<4x12xf32>) + res = linalg.MatmulOp( + result_tensors=[], + inputs=(Amem, Bmem), + outputs=(Cmem,), + ) + linalg.fill_builtin_region(res.operation) + # CHECK: linalg.matmul ins(%[[Amem]], %[[Bmem]] : memref<4x8xf32>, memref<8x12xf32>) outs(%[[Cmem]] : memref<4x12xf32>) + linalg.matmul(Amem, Bmem, outs=(Cmem,)) + + # CHECK: linalg.matmul indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[Amem]], %[[BTransmem]] : memref<4x8xf32>, memref<12x8xf32>) outs(%[[Cmem]] : memref<4x12xf32>) + res = linalg.MatmulOp( + result_tensors=[], + inputs=(Amem, Btransposedmem), + outputs=(Cmem,), + indexing_maps=[a_map, b_transposed_map, c_map], + ) + linalg.fill_builtin_region(res.operation) + # CHECK: linalg.matmul indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[Amem]], %[[BTransmem]] : memref<4x8xf32>, memref<12x8xf32>) outs(%[[Cmem]] : memref<4x12xf32>) + linalg.matmul( + Amem, + Btransposedmem, + outs=(Cmem,), + indexing_maps=[a_map, b_transposed_map, c_map], + ) + + print(module) + + +# CHECK-LABEL: TEST: testContractOp +@run +def testContractOp(): + with Context(), Location.unknown(): + module = Module.create() + f32 = F32Type.get() + with InsertionPoint(module.body): + a_shape = (4, 8) + b_shape = (8, 12) + b_transposed_shape = (12, 8) + c_shape = (4, 12) + + dimM = ir.AffineDimExpr.get(0) + dimN = ir.AffineDimExpr.get(1) + dimK = ir.AffineDimExpr.get(2) + + # CHECK: #[[$A_MAP:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)> + # CHECK: #[[$B_MAP:.*]] = affine_map<(d0, d1, d2) -> 
(d2, d1)> + # CHECK: #[[$C_MAP:.*]] = affine_map<(d0, d1, d2) -> (d0, d1)> + # CHECK: #[[$BTrans_MAP:.*]] = affine_map<(d0, d1, d2) -> (d1, d2)> + a_map = ir.AffineMap.get(3, 0, [dimM, dimK]) + b_map = ir.AffineMap.get(3, 0, [dimK, dimN]) + c_map = ir.AffineMap.get(3, 0, [dimM, dimN]) + b_transposed_map = ir.AffineMap.get(3, 0, [dimN, dimK]) + + # CHECK: func.func @matmul_as_contract_op( + @func.FuncOp.from_py_func( + # CHECK-SAME: %[[A:.*]]: tensor<4x8xf32>, + RankedTensorType.get(a_shape, f32), + # CHECK-SAME: %[[Amem:.*]]: memref<4x8xf32>, + MemRefType.get(a_shape, f32), + # CHECK-SAME: %[[B:.*]]: tensor<8x12xf32>, + RankedTensorType.get(b_shape, f32), + # CHECK-SAME: %[[Bmem:.*]]: memref<8x12xf32>, + MemRefType.get(b_shape, f32), + # CHECK-SAME: %[[BTrans:.*]]: tensor<12x8xf32>, + RankedTensorType.get(b_transposed_shape, f32), + # CHECK-SAME: %[[BTransmem:.*]]: memref<12x8xf32>, + MemRefType.get(b_transposed_shape, f32), + # CHECK-SAME: %[[C:.*]]: tensor<4x12xf32>, + RankedTensorType.get(c_shape, f32), + # CHECK-SAME: %[[Cmem:.*]]: memref<4x12xf32>) + MemRefType.get(c_shape, f32), + ) + def matmul_as_contract_op( + A, Amem, B, Bmem, Btransposed, Btransposedmem, C, Cmem + ): + # CHECK: linalg.contract indexing_maps = [#[[$A_MAP]], #[[$B_MAP]], #[[$C_MAP]]] ins(%[[A]], %[[B]] : tensor<4x8xf32>, tensor<8x12xf32>) outs(%[[C]] : tensor<4x12xf32>) + op4 = linalg.ContractOp( + result_tensors=(C.type,), + inputs=(A, B), + outputs=(C,), + indexing_maps=[a_map, b_map, c_map], + ) + linalg.fill_builtin_region(op4.operation) + # CHECK: linalg.contract indexing_maps = [#[[$A_MAP]], #[[$B_MAP]], #[[$C_MAP]]] ins(%[[A]], %[[B]] : tensor<4x8xf32>, tensor<8x12xf32>) outs(%[[C]] : tensor<4x12xf32>) + op5 = linalg.contract( + A, B, outs=(C,), indexing_maps=[a_map, b_map, c_map] + ) + + # CHECK: linalg.contract indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[A]], %[[BTrans]] : tensor<4x8xf32>, tensor<12x8xf32>) outs(%[[C]] : tensor<4x12xf32>) + op4 = 
linalg.ContractOp( + result_tensors=(C.type,), + inputs=(A, Btransposed), + outputs=(C,), + indexing_maps=[a_map, b_transposed_map, c_map], + ) + linalg.fill_builtin_region(op4.operation) + # CHECK: linalg.contract indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[A]], %[[BTrans]] : tensor<4x8xf32>, tensor<12x8xf32>) outs(%[[C]] : tensor<4x12xf32>) + op5 = linalg.contract( + A, + Btransposed, + outs=(C,), + indexing_maps=[a_map, b_transposed_map, c_map], + ) + # And now with memrefs... + + # CHECK: linalg.contract indexing_maps = [#[[$A_MAP]], #[[$B_MAP]], #[[$C_MAP]]] ins(%[[Amem]], %[[Bmem]] : memref<4x8xf32>, memref<8x12xf32>) outs(%[[Cmem]] : memref<4x12xf32>) + op4 = linalg.ContractOp( + result_tensors=[], + inputs=(Amem, Bmem), + outputs=(Cmem,), + indexing_maps=[a_map, b_map, c_map], + ) + linalg.fill_builtin_region(op4.operation) + # CHECK: linalg.contract indexing_maps = [#[[$A_MAP]], #[[$B_MAP]], #[[$C_MAP]]] ins(%[[Amem]], %[[Bmem]] : memref<4x8xf32>, memref<8x12xf32>) outs(%[[Cmem]] : memref<4x12xf32>) + linalg.contract( + Amem, Bmem, outs=(Cmem,), indexing_maps=[a_map, b_map, c_map] + ) + + # CHECK: linalg.contract indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[Amem]], %[[BTransmem]] : memref<4x8xf32>, memref<12x8xf32>) outs(%[[Cmem]] : memref<4x12xf32>) + op4 = linalg.ContractOp( + result_tensors=[], + inputs=(Amem, Btransposedmem), + outputs=(Cmem,), + indexing_maps=[a_map, b_transposed_map, c_map], + ) + linalg.fill_builtin_region(op4.operation) + # CHECK: linalg.contract indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[Amem]], %[[BTransmem]] : memref<4x8xf32>, memref<12x8xf32>) outs(%[[Cmem]] : memref<4x12xf32>) + linalg.contract( + Amem, + Btransposedmem, + outs=(Cmem,), + indexing_maps=[a_map, b_transposed_map, c_map], + ) + + print(module) diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp index a970cbc5caceb..629e863dac5e3 100644 --- 
a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp @@ -1334,8 +1334,9 @@ static void emitAttrGetterWithReturnType(FmtContext &fctx, PrintFatalError("DefaultValuedAttr of type " + attr.getAttrDefName() + " must have a constBuilder"); } - std::string defaultValue = std::string( - tgfmt(attr.getConstBuilderTemplate(), &fctx, attr.getDefaultValue())); + std::string defaultValue = + std::string(tgfmt(attr.getConstBuilderTemplate(), &fctx, + tgfmt(attr.getDefaultValue(), &fctx))); body << " if (!attr)\n return " << tgfmt(attr.getConvertFromStorageCall(), &fctx.withSelf(defaultValue)) @@ -1467,6 +1468,7 @@ void OpEmitter::genPropertiesSupport() { os << " if (!attr) attr = dict.get(\"result_segment_sizes\");"; } + fctx.withBuilder(odsBuilder); setPropMethod << "{\n" << formatv(propFromAttrFmt, tgfmt(prop.getConvertFromAttributeCall(), @@ -1479,7 +1481,7 @@ void OpEmitter::genPropertiesSupport() { prop.getStorageTypeValueOverride()); } else if (prop.hasDefaultValue()) { setPropMethod << formatv(attrGetDefaultFmt, name, - prop.getDefaultValue()); + tgfmt(prop.getDefaultValue(), &fctx)); } else { setPropMethod << formatv(attrGetNoDefaultFmt, name); } @@ -2919,6 +2921,9 @@ getBuilderSignature(const Builder &builder) { arguments.emplace_back("::mlir::OpBuilder &", odsBuilder); arguments.emplace_back("::mlir::OperationState &", builderOpState); + FmtContext fctx; + fctx.withBuilder(odsBuilder); + for (unsigned i = 0, e = params.size(); i < e; ++i) { // If no name is provided, generate one. 
std::optional paramName = params[i].getName(); @@ -2931,7 +2936,7 @@ getBuilderSignature(const Builder &builder) { defaultValue = *defaultParamValue; arguments.emplace_back(params[i].getCppType(), std::move(name), - defaultValue); + tgfmt(defaultValue, &fctx)); } return arguments; @@ -3189,6 +3194,9 @@ void OpEmitter::buildParamList(SmallVectorImpl ¶mList, } } + FmtContext fctx; + fctx.withBuilder(odsBuilder); + for (int i = 0, e = op.getNumArgs(), numOperands = 0; i < e; ++i) { Argument arg = op.getArg(i); if (const auto *operand = @@ -3210,7 +3218,7 @@ void OpEmitter::buildParamList(SmallVectorImpl ¶mList, StringRef type = prop.getInterfaceType(); std::string defaultValue; if (prop.hasDefaultValue() && i >= defaultValuedAttrLikeStartIndex) { - defaultValue = prop.getDefaultValue(); + defaultValue = tgfmt(prop.getDefaultValue(), &fctx); } bool isOptional = prop.hasDefaultValue(); paramList.emplace_back(type, propArg->name, StringRef(defaultValue), @@ -3242,7 +3250,7 @@ void OpEmitter::buildParamList(SmallVectorImpl ¶mList, if (i >= defaultValuedAttrStartIndex) { if (attrParamKind == AttrParamKind::UnwrappedValue && canUseUnwrappedRawValue(attr)) - defaultValue += attr.getDefaultValue(); + defaultValue += tgfmt(attr.getDefaultValue(), &fctx); else defaultValue += "nullptr"; } @@ -4172,6 +4180,9 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter( staticVerifierEmitter(staticVerifierEmitter), emitHelper(op, /*emitForOp=*/false) { + FmtContext fctx; + fctx.withBuilder(odsBuilder); + genericAdaptorBase.declare(Visibility::Public); bool useProperties = emitHelper.hasProperties(); if (useProperties) { @@ -4212,7 +4223,7 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter( if (prop.hasStorageTypeValueOverride()) os << " = " << prop.getStorageTypeValueOverride(); else if (prop.hasDefaultValue()) - os << " = " << prop.getDefaultValue(); + os << " = " << tgfmt(prop.getDefaultValue(), &fctx); comparatorOs << " rhs." 
<< name << " == this->" << name << " &&\n"; // Emit accessors using the interface type. @@ -4454,7 +4465,6 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter( if (auto *m = genericAdaptor.addMethod("RangeT", "getOperands")) m->body() << " return odsOperands;"; - FmtContext fctx; fctx.withBuilder("::mlir::Builder(odsAttrs.getContext())"); // Generate named accessor with Attribute return type. @@ -4481,8 +4491,9 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter( // Use the default value if attribute is not set. // TODO: this is inefficient, we are recreating the attribute for every // call. This should be set instead. - std::string defaultValue = std::string( - tgfmt(attr.getConstBuilderTemplate(), &fctx, attr.getDefaultValue())); + std::string defaultValue = + std::string(tgfmt(attr.getConstBuilderTemplate(), &fctx, + tgfmt(attr.getDefaultValue(), &fctx))); body << "if (!attr)\n attr = " << defaultValue << ";\n"; } body << "return attr;\n"; diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp index f03a3bfd398ed..fe724e86d6707 100644 --- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp @@ -1999,7 +1999,7 @@ static void genNonDefaultValueCheck(MethodBody &body, const Operator &op, fctx.withBuilder("::mlir::OpBuilder((*this)->getContext())"); body << getter << "Attr() != " << tgfmt(attr.getConstBuilderTemplate(), &fctx, - attr.getDefaultValue()); + tgfmt(attr.getDefaultValue(), &fctx)); } if (optionalAndDefault) body << ")"; @@ -2007,8 +2007,10 @@ static void genNonDefaultValueCheck(MethodBody &body, const Operator &op, static void genNonDefaultValueCheck(MethodBody &body, const Operator &op, PropertyVariable &propElement) { - body << op.getGetterName(propElement.getVar()->name) - << "() != " << propElement.getVar()->prop.getDefaultValue(); + FmtContext fctx; + fctx.withBuilder("::mlir::OpBuilder((*this)->getContext())"); + body << op.getGetterName(propElement.getVar()->name) << "() != " + 
<< tgfmt(propElement.getVar()->prop.getDefaultValue(), &fctx); } /// Elide the variadic segment size attributes if necessary. @@ -2045,8 +2047,9 @@ static void genPropDictPrinter(OperationFormat &fmt, Operator &op, const StringRef &name = namedAttr.name; FmtContext fctx; fctx.withBuilder("odsBuilder"); - std::string defaultValue = std::string( - tgfmt(attr.getConstBuilderTemplate(), &fctx, attr.getDefaultValue())); + std::string defaultValue = + std::string(tgfmt(attr.getConstBuilderTemplate(), &fctx, + tgfmt(attr.getDefaultValue(), &fctx))); body << " {\n"; body << " ::mlir::Builder odsBuilder(getContext());\n"; body << " ::mlir::Attribute attr = " << op.getGetterName(name) @@ -2059,8 +2062,10 @@ static void genPropDictPrinter(OperationFormat &fmt, Operator &op, // Similarly, elide default-valued properties. for (const NamedProperty &prop : op.getProperties()) { if (prop.prop.hasDefaultValue()) { + FmtContext fctx; + fctx.withBuilder("odsBuilder"); body << " if (" << op.getGetterName(prop.name) - << "() == " << prop.prop.getDefaultValue() << ") {"; + << "() == " << tgfmt(prop.prop.getDefaultValue(), &fctx) << ") {"; body << " elidedProps.push_back(\"" << prop.name << "\");\n"; body << " }\n"; } @@ -2094,8 +2099,9 @@ static void genAttrDictPrinter(OperationFormat &fmt, Operator &op, const StringRef &name = namedAttr.name; FmtContext fctx; fctx.withBuilder("odsBuilder"); - std::string defaultValue = std::string( - tgfmt(attr.getConstBuilderTemplate(), &fctx, attr.getDefaultValue())); + std::string defaultValue = + std::string(tgfmt(attr.getConstBuilderTemplate(), &fctx, + tgfmt(attr.getDefaultValue(), &fctx))); body << " {\n"; body << " ::mlir::Builder odsBuilder(getContext());\n"; body << " ::mlir::Attribute attr = " << op.getGetterName(name) diff --git a/mlir/tools/mlir-tblgen/RewriterGen.cpp b/mlir/tools/mlir-tblgen/RewriterGen.cpp index a041c4d327779..f6eb5bdfe568e 100644 --- a/mlir/tools/mlir-tblgen/RewriterGen.cpp +++ b/mlir/tools/mlir-tblgen/RewriterGen.cpp 
@@ -879,7 +879,7 @@ void PatternEmitter::emitAttributeMatch(DagNode tree, StringRef opName, if (attr.hasDefaultValue()) { os << "if (!tblgen_attr) tblgen_attr = " << std::string(tgfmt(attr.getConstBuilderTemplate(), &fmtCtx, - attr.getDefaultValue())) + tgfmt(attr.getDefaultValue(), &fmtCtx))) << ";\n"; } else if (attr.isOptional()) { // For a missing attribute that is optional according to definition, we From 729416e586fba71b4f63d71b1b5c765aefbf200b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Don=C3=A1t=20Nagy?= Date: Mon, 10 Feb 2025 13:25:07 +0100 Subject: [PATCH 144/293] [analyzer][NFC] Remove "V2" from ArrayBoundCheckerV2.cpp (#126094) Previously commit 6e17ed9b04e5523cc910bf171c3122dcc64b86db deleted the obsolete checker `alpha.security.ArrayBound` which was implemented in `ArrayBoundChecker.cpp` and renamed the checker `alpha.security.ArrayBoundV2` to `security.ArrayBound`. This commit concludes that consolidation by renaming the source file `ArrayBoundCheckerV2.cpp` to `ArrayBoundChecker.cpp` (which was "freed up" by the previous commit). 
--- ...{ArrayBoundCheckerV2.cpp => ArrayBoundChecker.cpp} | 11 +++-------- clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt | 2 +- .../clang/lib/StaticAnalyzer/Checkers/BUILD.gn | 2 +- 3 files changed, 5 insertions(+), 10 deletions(-) rename clang/lib/StaticAnalyzer/Checkers/{ArrayBoundCheckerV2.cpp => ArrayBoundChecker.cpp} (98%) diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp similarity index 98% rename from clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp rename to clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp index 6f8d6dbd573f4..109faacf1726a 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp @@ -1,4 +1,4 @@ -//== ArrayBoundCheckerV2.cpp ------------------------------------*- C++ -*--==// +//== ArrayBoundChecker.cpp -------------------------------------------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -11,12 +11,6 @@ // //===----------------------------------------------------------------------===// -// NOTE: The name of this file ends with "V2" because previously -// "ArrayBoundChecker.cpp" contained the implementation of another (older and -// simpler) checker that was called `alpha.security.ArrayBound`. -// TODO: Rename this file to "ArrayBoundChecker.cpp" when it won't be confused -// with that older file. 
- #include "clang/AST/CharUnits.h" #include "clang/AST/ParentMapContext.h" #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h" @@ -297,7 +291,8 @@ static std::pair compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold, SValBuilder &SVB, bool CheckEquality = false) { if (auto ConcreteThreshold = Threshold.getAs()) { - std::tie(Value, Threshold) = getSimplifiedOffsets(Value, *ConcreteThreshold, SVB); + std::tie(Value, Threshold) = + getSimplifiedOffsets(Value, *ConcreteThreshold, SVB); } // We want to perform a _mathematical_ comparison between the numbers `Value` diff --git a/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt b/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt index ccff5d0ac3b96..5910043440987 100644 --- a/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt +++ b/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt @@ -7,7 +7,7 @@ set(LLVM_LINK_COMPONENTS add_clang_library(clangStaticAnalyzerCheckers AnalysisOrderChecker.cpp AnalyzerStatsChecker.cpp - ArrayBoundCheckerV2.cpp + ArrayBoundChecker.cpp BasicObjCFoundationChecks.cpp BitwiseShiftChecker.cpp BlockInCriticalSectionChecker.cpp diff --git a/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn index c1bba99be3ba5..d9c3257536639 100644 --- a/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn @@ -16,7 +16,7 @@ static_library("Checkers") { sources = [ "AnalysisOrderChecker.cpp", "AnalyzerStatsChecker.cpp", - "ArrayBoundCheckerV2.cpp", + "ArrayBoundChecker.cpp", "BasicObjCFoundationChecks.cpp", "BitwiseShiftChecker.cpp", "BlockInCriticalSectionChecker.cpp", From 83fa117f76f9c4c82ce0ca914c4eba268c6c2fa2 Mon Sep 17 00:00:00 2001 From: "Mikhail R. 
Gadelha" Date: Mon, 10 Feb 2025 10:11:28 -0300 Subject: [PATCH 145/293] [RISCV] Add cost model for fma (#126076) This change builds on PR #125683, which added a cost model for fmuladd. To ensure completeness, this patch extends the cost model to also cover fma, using the same costing approach as fmuladd. I plan to send a follow-up patch that includes the cost model for vp_fma and vp_fmuladd, and their tests. --- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 1 + .../test/Analysis/CostModel/RISCV/arith-fp.ll | 78 ++++++++----------- 2 files changed, 32 insertions(+), 47 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index b3ddd07902a5c..c46400a1936ad 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1176,6 +1176,7 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } break; } + case Intrinsic::fma: case Intrinsic::fmuladd: { // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin auto LT = getTypeLegalizationCost(RetTy); diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll index 0928935b43db2..852c237cf2501 100644 --- a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll +++ b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll @@ -1381,30 +1381,30 @@ define void @fma() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2BF16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4BF16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8BF16 = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %V16BF16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16BF16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1BF16 = call @llvm.fma.nxv1bf16( undef, undef, undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2BF16 = call @llvm.fma.nxv2bf16( undef, undef, undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4BF16 = call @llvm.fma.nxv4bf16( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV8BF16 = call @llvm.fma.nxv8bf16( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16BF16 = call @llvm.fma.nxv16bf16( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <1 x float> @llvm.fma.v1f32(<1 x float> undef, <1 x float> undef, <1 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8BF16 = call @llvm.fma.nxv8bf16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV16BF16 = call @llvm.fma.nxv16bf16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = call <1 x float> @llvm.fma.v1f32(<1 x float> undef, <1 x float> undef, <1 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V2F32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F32 = call @llvm.fma.nxv1f32( undef, undef, undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F32 = call @llvm.fma.nxv2f32( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F32 = call @llvm.fma.nxv4f32( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV8F32 = call @llvm.fma.nxv8f32( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16F32 = call @llvm.fma.nxv16f32( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <1 x double> @llvm.fma.v1f64(<1 x double> undef, <1 x double> undef, <1 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F32 = call @llvm.fma.nxv4f32( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %NXV8F32 = call @llvm.fma.nxv8f32( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV16F32 = call @llvm.fma.nxv16f32( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.fma.v1f64(<1 x double> undef, <1 x double> undef, <1 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F64 = call @llvm.fma.nxv1f64( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F64 = call @llvm.fma.nxv2f64( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F64 = call @llvm.fma.nxv4f64( undef, undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV8F64 = call @llvm.fma.nxv8f64( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F64 = call @llvm.fma.nxv2f64( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV4F64 = call @llvm.fma.nxv4f64( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV8F64 = call @llvm.fma.nxv8f64( undef, undef, undef) ; CHECK-NEXT: Cost 
Model: Found an estimated cost of 0 for instruction: ret void ; %BF16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) @@ -1449,37 +1449,21 @@ define void @fma() { } define void @fma_f16() { -; ZVFH-LABEL: 'fma_f16' -; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.fma.f16(half undef, half undef, half undef) -; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = call <1 x half> @llvm.fma.v1f16(<1 x half> undef, <1 x half> undef, <1 x half> undef) -; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) -; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) -; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef) -; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) -; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32F16 = call <32 x half> @llvm.fma.v32f16(<32 x half> undef, <32 x half> undef, <32 x half> undef) -; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = call @llvm.fma.nxv1f16( undef, undef, undef) -; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = call @llvm.fma.nxv2f16( undef, undef, undef) -; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = call @llvm.fma.nxv4f16( undef, undef, undef) -; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV8F16 = call @llvm.fma.nxv8f16( undef, undef, undef) -; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV16F16 = call 
@llvm.fma.nxv16f16( undef, undef, undef) -; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV32F16 = call @llvm.fma.nxv32f16( undef, undef, undef) -; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; -; ZVFHMIN-LABEL: 'fma_f16' -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.fma.f16(half undef, half undef, half undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = call <1 x half> @llvm.fma.v1f16(<1 x half> undef, <1 x half> undef, <1 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32F16 = call <32 x half> @llvm.fma.v32f16(<32 x half> undef, <32 x half> undef, <32 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = call @llvm.fma.nxv1f16( undef, undef, undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = call @llvm.fma.nxv2f16( undef, undef, undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = call @llvm.fma.nxv4f16( undef, undef, undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV8F16 = call @llvm.fma.nxv8f16( undef, undef, undef) -; ZVFHMIN-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %NXV16F16 = call @llvm.fma.nxv16f16( undef, undef, undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV32F16 = call @llvm.fma.nxv32f16( undef, undef, undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-LABEL: 'fma_f16' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.fma.f16(half undef, half undef, half undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = call <1 x half> @llvm.fma.v1f16(<1 x half> undef, <1 x half> undef, <1 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = call <8 x half> @llvm.fma.v8f16(<8 x half> undef, <8 x half> undef, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32F16 = call <32 x half> @llvm.fma.v32f16(<32 x half> undef, <32 x half> undef, <32 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1F16 = call @llvm.fma.nxv1f16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2F16 = call @llvm.fma.nxv2f16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4F16 = call @llvm.fma.nxv4f16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = call 
@llvm.fma.nxv8f16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NXV16F16 = call @llvm.fma.nxv16f16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NXV32F16 = call @llvm.fma.nxv32f16( undef, undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %F16 = call half @llvm.fma.f16(half undef, half undef, half undef) From 121e6abefd9cd0276d04df32df1da3604c044cdf Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 10 Feb 2025 13:22:42 +0000 Subject: [PATCH 146/293] [X86] IsElementEquivalent - pull out repeated getValueType calls. NFC. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 34ac4262beb85..995b4de12ce12 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -9789,6 +9789,7 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode()) return false; + EVT VT = Op.getValueType(); switch (Op.getOpcode()) { case ISD::BUILD_VECTOR: // If the values are build vectors, we can look through them to find @@ -9800,9 +9801,8 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, break; case X86ISD::VBROADCAST: case X86ISD::VBROADCAST_LOAD: - // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()? - return (Op == ExpectedOp && - (int)Op.getValueType().getVectorNumElements() == MaskSize); + // TODO: Handle MaskSize != VT.getVectorNumElements()? + return (Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize); case X86ISD::HADD: case X86ISD::HSUB: case X86ISD::FHADD: @@ -9813,7 +9813,6 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, // TODO: Handle MaskSize != NumElts? 
// TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases. if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) { - MVT VT = Op.getSimpleValueType(); int NumElts = VT.getVectorNumElements(); if (MaskSize == NumElts) { int NumLanes = VT.getSizeInBits() / 128; From bde8ce6a5c47a3e5719618797cc4143db6f871f5 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Mon, 10 Feb 2025 08:24:50 -0500 Subject: [PATCH 147/293] [AMDGPU] Only run `AMDGPUPrintfRuntimeBindingPass` at non-prelink phase (#125162) --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 ++- llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c6d36fde9730a..e8afbfbb411c3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -803,7 +803,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PB.registerPipelineEarlySimplificationEPCallback( [](ModulePassManager &PM, OptimizationLevel Level, ThinOrFullLTOPhase Phase) { - PM.addPass(AMDGPUPrintfRuntimeBindingPass()); + if (!isLTOPreLink(Phase)) + PM.addPass(AMDGPUPrintfRuntimeBindingPass()); if (Level == OptimizationLevel::O0) return; diff --git a/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll b/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll index c68143f44866f..b1fc76f457ece 100644 --- a/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll +++ b/llvm/test/CodeGen/AMDGPU/print-pipeline-passes.ll @@ -14,6 +14,7 @@ ; PRE-NOT: internalize ; PRE-NOT: amdgpu-attributor +; PRE-NOT: printfToRuntime define amdgpu_kernel void @kernel() { entry: From 199c791a1dbf417fdb08fbbb054d51ed398f285a Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 10 Feb 2025 14:28:40 +0100 Subject: [PATCH 148/293] [clang][bytecode] Support partial initializers for CXXNewExprs (#126494) For `new A[N]{1,2,3}`, we need to allocate N 
elements of type A, and initialize the first three with the given InitListExpr elements. However, if N is larger than 3, we need to initialize the remaining elements with the InitListExpr array filler. Similarly, for `new A[N];`, we need to initialize all fields with the constructor of A. The initializer type is a CXXConstructExpr of IncompleteArrayType in this case, which we can't generally handle. --- clang/lib/AST/ByteCode/Compiler.cpp | 121 ++++++++++++++++++++++++- clang/lib/AST/ByteCode/Interp.h | 5 +- clang/test/AST/ByteCode/new-delete.cpp | 57 +++++++++++- 3 files changed, 172 insertions(+), 11 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 1f0e022edcd76..86a3773d74d05 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -3370,15 +3370,23 @@ bool Compiler::VisitCXXNewExpr(const CXXNewExpr *E) { PrimType SizeT = classifyPrim(Stripped->getType()); + // Save evaluated array size to a variable. 
+ unsigned ArrayLen = allocateLocalPrimitive( + Stripped, SizeT, /*IsConst=*/false, /*IsExtended=*/false); + if (!this->visit(Stripped)) + return false; + if (!this->emitSetLocal(SizeT, ArrayLen, E)) + return false; + if (PlacementDest) { if (!this->visit(PlacementDest)) return false; - if (!this->visit(Stripped)) + if (!this->emitGetLocal(SizeT, ArrayLen, E)) return false; if (!this->emitCheckNewTypeMismatchArray(SizeT, E, E)) return false; } else { - if (!this->visit(Stripped)) + if (!this->emitGetLocal(SizeT, ArrayLen, E)) return false; if (ElemT) { @@ -3392,10 +3400,113 @@ bool Compiler::VisitCXXNewExpr(const CXXNewExpr *E) { } } - if (Init && !this->visitInitializer(Init)) - return false; + if (Init) { + QualType InitType = Init->getType(); + size_t StaticInitElems = 0; + const Expr *DynamicInit = nullptr; + if (const ConstantArrayType *CAT = + Ctx.getASTContext().getAsConstantArrayType(InitType)) { + StaticInitElems = CAT->getZExtSize(); + if (!this->visitInitializer(Init)) + return false; - } else { + if (const auto *ILE = dyn_cast(Init); + ILE && ILE->hasArrayFiller()) + DynamicInit = ILE->getArrayFiller(); + } + + // The initializer initializes a certain number of elements, S. + // However, the complete number of elements, N, might be larger than that. + // In this case, we need to get an initializer for the remaining elements. + // There are two cases: + // 1) For the form 'new Struct[n];', the initializer is a + // CXXConstructExpr and its type is an IncompleteArrayType. + // 2) For the form 'new Struct[n]{1,2,3}', the initializer is an + // InitListExpr and the initializer for the remaining elements + // is the array filler. 
+ + if (DynamicInit || InitType->isIncompleteArrayType()) { + const Function *CtorFunc = nullptr; + if (const auto *CE = dyn_cast(Init)) { + CtorFunc = getFunction(CE->getConstructor()); + if (!CtorFunc) + return false; + } + + LabelTy EndLabel = this->getLabel(); + LabelTy StartLabel = this->getLabel(); + + // In the nothrow case, the alloc above might have returned nullptr. + // Don't call any constructors in that case. + if (IsNoThrow) { + if (!this->emitDupPtr(E)) + return false; + if (!this->emitNullPtr(0, nullptr, E)) + return false; + if (!this->emitEQPtr(E)) + return false; + if (!this->jumpTrue(EndLabel)) + return false; + } + + // Create loop variables. + unsigned Iter = allocateLocalPrimitive( + Stripped, SizeT, /*IsConst=*/false, /*IsExtended=*/false); + if (!this->emitConst(StaticInitElems, SizeT, E)) + return false; + if (!this->emitSetLocal(SizeT, Iter, E)) + return false; + + this->fallthrough(StartLabel); + this->emitLabel(StartLabel); + // Condition. Iter < ArrayLen? + if (!this->emitGetLocal(SizeT, Iter, E)) + return false; + if (!this->emitGetLocal(SizeT, ArrayLen, E)) + return false; + if (!this->emitLT(SizeT, E)) + return false; + if (!this->jumpFalse(EndLabel)) + return false; + + // Pointer to the allocated array is already on the stack. 
+ if (!this->emitGetLocal(SizeT, Iter, E)) + return false; + if (!this->emitArrayElemPtr(SizeT, E)) + return false; + + if (DynamicInit) { + if (std::optional InitT = classify(DynamicInit)) { + if (!this->visit(DynamicInit)) + return false; + if (!this->emitStorePop(*InitT, E)) + return false; + } else { + if (!this->visitInitializer(DynamicInit)) + return false; + if (!this->emitPopPtr(E)) + return false; + } + } else { + assert(CtorFunc); + if (!this->emitCall(CtorFunc, 0, E)) + return false; + } + + // ++Iter; + if (!this->emitGetPtrLocal(Iter, E)) + return false; + if (!this->emitIncPop(SizeT, E)) + return false; + + if (!this->jump(StartLabel)) + return false; + + this->fallthrough(EndLabel); + this->emitLabel(EndLabel); + } + } + } else { // Non-array. if (PlacementDest) { if (!this->visit(PlacementDest)) return false; diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 66fd31feb24f4..5cc371c7ee495 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -1484,7 +1484,10 @@ bool InitThisBitField(InterpState &S, CodePtr OpPC, const Record::Field *F, template ::T> bool InitField(InterpState &S, CodePtr OpPC, uint32_t I) { const T &Value = S.Stk.pop(); - const Pointer &Field = S.Stk.peek().atField(I); + const Pointer &Ptr = S.Stk.peek(); + if (!CheckRange(S, OpPC, Ptr, CSK_Field)) + return false; + const Pointer &Field = Ptr.atField(I); Field.deref() = Value; Field.activate(); Field.initialize(); diff --git a/clang/test/AST/ByteCode/new-delete.cpp b/clang/test/AST/ByteCode/new-delete.cpp index a8f073aa03fc1..e60ff894c9715 100644 --- a/clang/test/AST/ByteCode/new-delete.cpp +++ b/clang/test/AST/ByteCode/new-delete.cpp @@ -268,11 +268,10 @@ namespace NowThrowNew { delete[] p; return result; } - /// This needs support for CXXConstrucExprs with non-constant array sizes. 
- static_assert(erroneous_array_bound_nothrow2(3)); // expected-error {{not an integral constant expression}} - static_assert(erroneous_array_bound_nothrow2(0));// expected-error {{not an integral constant expression}} - static_assert(erroneous_array_bound_nothrow2(-1) == 0);// expected-error {{not an integral constant expression}} - static_assert(!erroneous_array_bound_nothrow2(1LL << 62));// expected-error {{not an integral constant expression}} + static_assert(erroneous_array_bound_nothrow2(3)); + static_assert(erroneous_array_bound_nothrow2(0)); + static_assert(erroneous_array_bound_nothrow2(-1) == 0); + static_assert(!erroneous_array_bound_nothrow2(1LL << 62)); constexpr bool erroneous_array_bound(long long n) { delete[] new int[n]; // both-note {{array bound -1 is negative}} both-note {{array bound 4611686018427387904 is too large}} @@ -857,6 +856,54 @@ struct SS { }; constexpr unsigned short ssmall = SS(100)[42]; + + +namespace IncompleteArray { + struct A { + int b = 10; + }; + constexpr int test1() { + int n = 5; + int* a = new int[n]; + int c = a[0]; // both-note {{read of uninitialized object}} + delete[] a; + return c; + } + static_assert(test1() == 10); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} + + constexpr int test2() { + int n = 0; + int* a = new int[n]; + delete[] a; + return 10; + } + static_assert(test2() == 10); + + /// In this case, the type of the initializer is A[2], while the full size of the + /// allocated array is of course 5. The remaining 3 elements need to be initialized + /// using A's constructor. 
+ constexpr int test3() { + int n = 3; + A* a = new A[n]{5, 1}; + int c = a[0].b + a[1].b + a[2].b; + delete[] a; + return c; + } + static_assert(test3() == (5 + 1 + 10)); + + constexpr int test4() { + auto n = 3; + int *a = new int[n]{12}; + int c = a[0] + a[1]; + delete[] a; + return c; + } + static_assert(test4() == 12); + + +} + #else /// Make sure we reject this prior to C++20 constexpr int a() { // both-error {{never produces a constant expression}} From af2a228e0b5c9fbfa02f37f1be10800b17509617 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 10 Feb 2025 21:30:05 +0800 Subject: [PATCH 149/293] [RISCV][VLOPT] Fix passthru operand info for mixed-width instructions (#126504) After #124066 we started allowing users that are passthrus. However for widening/narrowing instructions we were returning the wrong operand info for passthru operands since it originally assumed the operand would never be a passthru. This fixes it by handling it in IsMODef. --- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 3 ++- llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 6c19a8fd32d42..28f89df9554b5 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -208,7 +208,8 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { const bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(MI.getDesc()); const bool IsTied = RISCVII::isTiedPseudo(MI.getDesc().TSFlags); - bool IsMODef = MO.getOperandNo() == 0; + bool IsMODef = MO.getOperandNo() == 0 || + (HasPassthru && MO.getOperandNo() == MI.getNumExplicitDefs()); // All mask operands have EEW=1 if (isMaskOperand(MI, MO, MRI)) diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir index c6e8dd92f8458..d2906c4613295 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir @@ -97,7 +97,7 @@ name: vwop_vv_vd_passthru_use body: | bb.0: ; CHECK-LABEL: name: vwop_vv_vd_passthru_use - ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ ; CHECK-NEXT: early-clobber %y:vr = PseudoVWADD_VV_MF2 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 @@ -223,7 +223,7 @@ name: vwop_wv_vd_passthru_use body: | bb.0: ; CHECK-LABEL: name: vwop_wv_vd_passthru_use - ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ ; CHECK-NEXT: early-clobber %y:vr = PseudoVWADD_WV_MF2 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 4 /* e16 */, 0 /* tu, mu */ %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 @@ -1115,7 +1115,7 @@ name: vmop_vv_passthru_use body: | bb.0: ; CHECK-LABEL: name: vmop_vv_passthru_use - ; CHECK: %x:vrnov0 = PseudoVMAND_MM_B8 $noreg, $noreg, -1, 0 /* e8 */ + ; CHECK: %x:vrnov0 = PseudoVMAND_MM_B8 $noreg, $noreg, 1, 0 /* e8 */ ; CHECK-NEXT: %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */ ; CHECK-NEXT: %z:vr = PseudoVMAND_MM_B8 %y, $noreg, 1, 0 /* e8 */ %x:vrnov0 = PseudoVMAND_MM_B8 $noreg, $noreg, -1, 0 /* e1 */ @@ -1127,7 +1127,7 @@ name: vmop_vv_passthru_use_incompatible_eew body: | bb.0: ; CHECK-LABEL: name: vmop_vv_passthru_use_incompatible_eew - ; CHECK: %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + ; CHECK: %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, 
mu */ ; CHECK-NEXT: %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */ ; CHECK-NEXT: %z:vr = PseudoVMAND_MM_B8 %y, $noreg, 1, 0 /* e8 */ %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 From c6b13a28717455028bf48bcb20f723ad3bbff783 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 10 Feb 2025 13:31:18 +0000 Subject: [PATCH 150/293] Revert "SCEV: teach isImpliedViaOperations about samesign" (#126506) The commit f5d24e6c is buggy, and following miscompiles have been reported: #126409 and https://github.com/llvm/llvm-project/pull/124270#issuecomment-2647222903 Revert it while we investigate. --- llvm/lib/Analysis/ScalarEvolution.cpp | 33 ++-- .../ScalarEvolution/exit-count-samesign.ll | 4 +- .../ScalarEvolution/implied-via-division.ll | 164 ++++-------------- .../IndVarSimplify/iv-ext-samesign.ll | 24 +-- 4 files changed, 59 insertions(+), 166 deletions(-) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 46a5c44f4e41a..573b052aa4b2c 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -11860,13 +11860,15 @@ bool ScalarEvolution::isImpliedCondBalancedTypes( } // Check whether the found predicate is the same as the desired predicate. - if (auto P = CmpPredicate::getMatching(FoundPred, Pred)) - return isImpliedCondOperands(*P, LHS, RHS, FoundLHS, FoundRHS, CtxI); + // FIXME: use CmpPredicate::getMatching here. + if (FoundPred == static_cast(Pred)) + return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS, CtxI); // Check whether swapping the found predicate makes it the same as the // desired predicate. - if (auto P = CmpPredicate::getMatching( - ICmpInst::getSwappedCmpPredicate(FoundPred), Pred)) { + // FIXME: use CmpPredicate::getMatching here. + if (ICmpInst::getSwappedCmpPredicate(FoundPred) == + static_cast(Pred)) { // We can write the implication // 0. 
LHS Pred RHS <- FoundLHS SwapPred FoundRHS // using one of the following ways: @@ -11877,23 +11879,22 @@ bool ScalarEvolution::isImpliedCondBalancedTypes( // Forms 1. and 2. require swapping the operands of one condition. Don't // do this if it would break canonical constant/addrec ordering. if (!isa(RHS) && !isa(LHS)) - return isImpliedCondOperands(ICmpInst::getSwappedCmpPredicate(*P), RHS, - LHS, FoundLHS, FoundRHS, CtxI); + return isImpliedCondOperands(FoundPred, RHS, LHS, FoundLHS, FoundRHS, + CtxI); if (!isa(FoundRHS) && !isa(FoundLHS)) - return isImpliedCondOperands(*P, LHS, RHS, FoundRHS, FoundLHS, CtxI); + return isImpliedCondOperands(Pred, LHS, RHS, FoundRHS, FoundLHS, CtxI); // There's no clear preference between forms 3. and 4., try both. Avoid // forming getNotSCEV of pointer values as the resulting subtract is // not legal. if (!LHS->getType()->isPointerTy() && !RHS->getType()->isPointerTy() && - isImpliedCondOperands(ICmpInst::getSwappedCmpPredicate(*P), - getNotSCEV(LHS), getNotSCEV(RHS), FoundLHS, - FoundRHS, CtxI)) + isImpliedCondOperands(FoundPred, getNotSCEV(LHS), getNotSCEV(RHS), + FoundLHS, FoundRHS, CtxI)) return true; if (!FoundLHS->getType()->isPointerTy() && !FoundRHS->getType()->isPointerTy() && - isImpliedCondOperands(*P, LHS, RHS, getNotSCEV(FoundLHS), + isImpliedCondOperands(Pred, LHS, RHS, getNotSCEV(FoundLHS), getNotSCEV(FoundRHS), CtxI)) return true; @@ -12569,16 +12570,14 @@ bool ScalarEvolution::isImpliedViaOperations(CmpPredicate Pred, const SCEV *LHS, return false; // We only want to work with GT comparison so far. - if (ICmpInst::isLT(Pred)) { + if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_SLT) { Pred = ICmpInst::getSwappedCmpPredicate(Pred); std::swap(LHS, RHS); std::swap(FoundLHS, FoundRHS); } - CmpInst::Predicate P = Pred.getPreferredSignedPredicate(); - // For unsigned, try to reduce it to corresponding signed comparison. 
- if (P == ICmpInst::ICMP_UGT) + if (Pred == ICmpInst::ICMP_UGT) // We can replace unsigned predicate with its signed counterpart if all // involved values are non-negative. // TODO: We could have better support for unsigned. @@ -12591,10 +12590,10 @@ bool ScalarEvolution::isImpliedViaOperations(CmpPredicate Pred, const SCEV *LHS, FoundRHS) && isImpliedCondOperands(ICmpInst::ICMP_SGT, RHS, MinusOne, FoundLHS, FoundRHS)) - P = ICmpInst::ICMP_SGT; + Pred = ICmpInst::ICMP_SGT; } - if (P != ICmpInst::ICMP_SGT) + if (Pred != ICmpInst::ICMP_SGT) return false; auto GetOpFromSExt = [&](const SCEV *S) { diff --git a/llvm/test/Analysis/ScalarEvolution/exit-count-samesign.ll b/llvm/test/Analysis/ScalarEvolution/exit-count-samesign.ll index 4d569cc69fa2b..93c6bc08af2a0 100644 --- a/llvm/test/Analysis/ScalarEvolution/exit-count-samesign.ll +++ b/llvm/test/Analysis/ScalarEvolution/exit-count-samesign.ll @@ -5,9 +5,9 @@ define i32 @exit_count_samesign(i32 %iter.count, ptr %ptr) { ; CHECK-LABEL: 'exit_count_samesign' ; CHECK-NEXT: Determining loop execution counts for: @exit_count_samesign -; CHECK-NEXT: Loop %inner.loop: backedge-taken count is {(-2 + %iter.count),+,-1}<%outer.loop> +; CHECK-NEXT: Loop %inner.loop: backedge-taken count is (-1 + (1 smax {(-1 + %iter.count),+,-1}<%outer.loop>)) ; CHECK-NEXT: Loop %inner.loop: constant max backedge-taken count is i32 2147483646 -; CHECK-NEXT: Loop %inner.loop: symbolic max backedge-taken count is {(-2 + %iter.count),+,-1}<%outer.loop> +; CHECK-NEXT: Loop %inner.loop: symbolic max backedge-taken count is (-1 + (1 smax {(-1 + %iter.count),+,-1}<%outer.loop>)) ; CHECK-NEXT: Loop %inner.loop: Trip multiple is 1 ; CHECK-NEXT: Loop %outer.loop: Unpredictable backedge-taken count. ; CHECK-NEXT: Loop %outer.loop: Unpredictable constant max backedge-taken count. 
diff --git a/llvm/test/Analysis/ScalarEvolution/implied-via-division.ll b/llvm/test/Analysis/ScalarEvolution/implied-via-division.ll index d83301243ef30..a1d30406095ec 100644 --- a/llvm/test/Analysis/ScalarEvolution/implied-via-division.ll +++ b/llvm/test/Analysis/ScalarEvolution/implied-via-division.ll @@ -2,10 +2,12 @@ ; RUN: opt < %s -disable-output -passes="print" \ ; RUN: -scalar-evolution-classify-expressions=0 2>&1 | FileCheck %s -define void @implied1(i32 %n) { -; Prove that (n s> 1) ===> (n / 2 s> 0). -; CHECK-LABEL: 'implied1' -; CHECK-NEXT: Determining loop execution counts for: @implied1 +declare void @llvm.experimental.guard(i1, ...) + +define void @test_1(i32 %n) nounwind { +; Prove that (n > 1) ===> (n / 2 > 0). +; CHECK-LABEL: 'test_1' +; CHECK-NEXT: Determining loop execution counts for: @test_1 ; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + %n.div.2) ; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741822 ; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + %n.div.2) @@ -27,35 +29,10 @@ exit: ret void } -define void @implied1_samesign(i32 %n) { -; Prove that (n > 1) ===> (n / 2 s> 0). 
-; CHECK-LABEL: 'implied1_samesign' -; CHECK-NEXT: Determining loop execution counts for: @implied1_samesign -; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + %n.div.2) -; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741822 -; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + %n.div.2) -; CHECK-NEXT: Loop %header: Trip multiple is 1 -; -entry: - %cmp1 = icmp samesign ugt i32 %n, 1 - %n.div.2 = sdiv i32 %n, 2 - call void @llvm.assume(i1 %cmp1) - br label %header - -header: - %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] - %indvar.next = add i32 %indvar, 1 - %exitcond = icmp sgt i32 %n.div.2, %indvar.next - br i1 %exitcond, label %header, label %exit - -exit: - ret void -} - -define void @implied1_neg(i32 %n) { -; Prove that (n s> 0) =\=> (n / 2 s> 0). -; CHECK-LABEL: 'implied1_neg' -; CHECK-NEXT: Determining loop execution counts for: @implied1_neg +define void @test_1neg(i32 %n) nounwind { +; Prove that (n > 0) =\=> (n / 2 > 0). +; CHECK-LABEL: 'test_1neg' +; CHECK-NEXT: Determining loop execution counts for: @test_1neg ; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax %n.div.2)) ; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741822 ; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax %n.div.2)) @@ -77,10 +54,10 @@ exit: ret void } -define void @implied2(i32 %n) { -; Prove that (n s>= 2) ===> (n / 2 s> 0). -; CHECK-LABEL: 'implied2' -; CHECK-NEXT: Determining loop execution counts for: @implied2 +define void @test_2(i32 %n) nounwind { +; Prove that (n >= 2) ===> (n / 2 > 0). 
+; CHECK-LABEL: 'test_2' +; CHECK-NEXT: Determining loop execution counts for: @test_2 ; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + %n.div.2) ; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741822 ; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + %n.div.2) @@ -102,35 +79,10 @@ exit: ret void } -define void @implied2_samesign(i32 %n) { -; Prove that (n >= 2) ===> (n / 2 s> 0). -; CHECK-LABEL: 'implied2_samesign' -; CHECK-NEXT: Determining loop execution counts for: @implied2_samesign -; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax %n.div.2)) -; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741822 -; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax %n.div.2)) -; CHECK-NEXT: Loop %header: Trip multiple is 1 -; -entry: - %cmp1 = icmp samesign uge i32 %n, 2 - %n.div.2 = sdiv i32 %n, 2 - call void @llvm.assume(i1 %cmp1) - br label %header - -header: - %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] - %indvar.next = add i32 %indvar, 1 - %exitcond = icmp sgt i32 %n.div.2, %indvar.next - br i1 %exitcond, label %header, label %exit - -exit: - ret void -} - -define void @implied2_neg(i32 %n) { -; Prove that (n s>= 1) =\=> (n / 2 s> 0). -; CHECK-LABEL: 'implied2_neg' -; CHECK-NEXT: Determining loop execution counts for: @implied2_neg +define void @test_2neg(i32 %n) nounwind { +; Prove that (n >= 1) =\=> (n / 2 > 0). +; CHECK-LABEL: 'test_2neg' +; CHECK-NEXT: Determining loop execution counts for: @test_2neg ; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax %n.div.2)) ; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741822 ; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax %n.div.2)) @@ -152,10 +104,10 @@ exit: ret void } -define void @implied3(i32 %n) { -; Prove that (n s> -2) ===> (n / 2 s>= 0). 
-; CHECK-LABEL: 'implied3' -; CHECK-NEXT: Determining loop execution counts for: @implied3 +define void @test_3(i32 %n) nounwind { +; Prove that (n > -2) ===> (n / 2 >= 0). +; CHECK-LABEL: 'test_3' +; CHECK-NEXT: Determining loop execution counts for: @test_3 ; CHECK-NEXT: Loop %header: backedge-taken count is (1 + %n.div.2) ; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741824 ; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (1 + %n.div.2) @@ -177,35 +129,10 @@ exit: ret void } -define void @implied3_samesign(i32 %n) { -; Prove that (n > -2) ===> (n / 2 s>= 0). -; CHECK-LABEL: 'implied3_samesign' -; CHECK-NEXT: Determining loop execution counts for: @implied3_samesign -; CHECK-NEXT: Loop %header: backedge-taken count is (1 + %n.div.2) -; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1 -; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (1 + %n.div.2) -; CHECK-NEXT: Loop %header: Trip multiple is 1 -; -entry: - %cmp1 = icmp samesign ugt i32 %n, -2 - %n.div.2 = sdiv i32 %n, 2 - call void @llvm.assume(i1 %cmp1) - br label %header - -header: - %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] - %indvar.next = add i32 %indvar, 1 - %exitcond = icmp sge i32 %n.div.2, %indvar - br i1 %exitcond, label %header, label %exit - -exit: - ret void -} - -define void @implied3_neg(i32 %n) { +define void @test_3neg(i32 %n) nounwind { ; Prove that (n > -3) =\=> (n / 2 >= 0). 
-; CHECK-LABEL: 'implied3_neg' -; CHECK-NEXT: Determining loop execution counts for: @implied3_neg +; CHECK-LABEL: 'test_3neg' +; CHECK-NEXT: Determining loop execution counts for: @test_3neg ; CHECK-NEXT: Loop %header: backedge-taken count is (0 smax (1 + %n.div.2)) ; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741824 ; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (0 smax (1 + %n.div.2)) @@ -227,10 +154,10 @@ exit: ret void } -define void @implied4(i32 %n) { -; Prove that (n s>= -1) ===> (n / 2 s>= 0). -; CHECK-LABEL: 'implied4' -; CHECK-NEXT: Determining loop execution counts for: @implied4 +define void @test_4(i32 %n) nounwind { +; Prove that (n >= -1) ===> (n / 2 >= 0). +; CHECK-LABEL: 'test_4' +; CHECK-NEXT: Determining loop execution counts for: @test_4 ; CHECK-NEXT: Loop %header: backedge-taken count is (1 + %n.div.2) ; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741824 ; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (1 + %n.div.2) @@ -252,35 +179,10 @@ exit: ret void } -define void @implied4_samesign(i32 %n) { -; Prove that (n >= -1) ===> (n / 2 s>= 0). -; CHECK-LABEL: 'implied4_samesign' -; CHECK-NEXT: Determining loop execution counts for: @implied4_samesign -; CHECK-NEXT: Loop %header: backedge-taken count is (1 + %n.div.2) -; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1 -; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (1 + %n.div.2) -; CHECK-NEXT: Loop %header: Trip multiple is 1 -; -entry: - %cmp1 = icmp samesign uge i32 %n, -1 - %n.div.2 = sdiv i32 %n, 2 - call void @llvm.assume(i1 %cmp1) - br label %header - -header: - %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] - %indvar.next = add i32 %indvar, 1 - %exitcond = icmp sge i32 %n.div.2, %indvar - br i1 %exitcond, label %header, label %exit - -exit: - ret void -} - -define void @implied4_neg(i32 %n) { -; Prove that (n s>= -2) =\=> (n / 2 s>= 0). 
-; CHECK-LABEL: 'implied4_neg' -; CHECK-NEXT: Determining loop execution counts for: @implied4_neg +define void @test_4neg(i32 %n) nounwind { +; Prove that (n >= -2) =\=> (n / 2 >= 0). +; CHECK-LABEL: 'test_4neg' +; CHECK-NEXT: Determining loop execution counts for: @test_4neg ; CHECK-NEXT: Loop %header: backedge-taken count is (0 smax (1 + %n.div.2)) ; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 1073741824 ; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (0 smax (1 + %n.div.2)) diff --git a/llvm/test/Transforms/IndVarSimplify/iv-ext-samesign.ll b/llvm/test/Transforms/IndVarSimplify/iv-ext-samesign.ll index c4e26c98ed24a..1207f47c5e3c9 100644 --- a/llvm/test/Transforms/IndVarSimplify/iv-ext-samesign.ll +++ b/llvm/test/Transforms/IndVarSimplify/iv-ext-samesign.ll @@ -68,32 +68,28 @@ define i32 @iv_zext_zext_gt_slt(i32 %iter.count, ptr %ptr) { ; CHECK-LABEL: define i32 @iv_zext_zext_gt_slt( ; CHECK-SAME: i32 [[ITER_COUNT:%.*]], ptr [[PTR:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[ITER_COUNT]], -1 +; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[ITER_COUNT]] to i64 ; CHECK-NEXT: br label %[[OUTER_LOOP:.*]] ; CHECK: [[PH_LOOPEXIT:.*]]: ; CHECK-NEXT: br label %[[PH:.*]] ; CHECK: [[PH]]: -; CHECK-NEXT: [[INDVARS_IV_NEXT3:%.*]] = add i32 [[INDVARS_IV1:%.*]], -1 ; CHECK-NEXT: br label %[[OUTER_LOOP]] ; CHECK: [[OUTER_LOOP]]: -; CHECK-NEXT: [[INDVARS_IV1]] = phi i32 [ [[INDVARS_IV_NEXT3]], %[[PH]] ], [ [[TMP0]], %[[ENTRY]] ] -; CHECK-NEXT: [[IV_OUTER:%.*]] = phi i32 [ [[IV_OUTER_1:%.*]], %[[PH]] ], [ [[ITER_COUNT]], %[[ENTRY]] ] -; CHECK-NEXT: [[IV_OUTER_1]] = add nsw i32 [[IV_OUTER]], -1 -; CHECK-NEXT: [[INDVARS_IV_NEXT2:%.*]] = zext nneg i32 [[IV_OUTER_1]] to i64 +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT2:%.*]], %[[PH]] ], [ [[TMP0]], %[[ENTRY]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT2]] = add nsw i64 [[INDVARS_IV1]], -1 ; CHECK-NEXT: [[GEP_OUTER:%.*]] = getelementptr i8, ptr [[PTR]], 
i64 [[INDVARS_IV_NEXT2]] ; CHECK-NEXT: store i8 0, ptr [[GEP_OUTER]], align 1 -; CHECK-NEXT: [[EXIT_COND_OUTER:%.*]] = icmp samesign ugt i32 [[IV_OUTER]], 1 +; CHECK-NEXT: [[EXIT_COND_OUTER:%.*]] = icmp samesign ugt i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: br i1 [[EXIT_COND_OUTER]], label %[[INNER_LOOP_PREHEADER:.*]], label %[[PH]] ; CHECK: [[INNER_LOOP_PREHEADER]]: -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[INDVARS_IV1]] to i64 ; CHECK-NEXT: br label %[[INNER_LOOP:.*]] ; CHECK: [[INNER_LOOP]]: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[INNER_LOOP_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[INNER_LOOP]] ] ; CHECK-NEXT: [[GEP_INNER:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store i8 0, ptr [[GEP_INNER]], align 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[INNER_LOOP]], label %[[PH_LOOPEXIT]] +; CHECK-NEXT: [[EXIT_COND_INNER:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], [[INDVARS_IV_NEXT2]] +; CHECK-NEXT: br i1 [[EXIT_COND_INNER]], label %[[INNER_LOOP]], label %[[PH_LOOPEXIT]] ; CHECK: [[EXIT:.*:]] ; CHECK-NEXT: ret i32 0 ; @@ -428,32 +424,28 @@ define i32 @iv_sext_sext_gt_slt(i32 %iter.count, ptr %ptr) { ; CHECK-LABEL: define i32 @iv_sext_sext_gt_slt( ; CHECK-SAME: i32 [[ITER_COUNT:%.*]], ptr [[PTR:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP1:%.*]] = add nsw i32 [[ITER_COUNT]], -1 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[ITER_COUNT]] to i64 ; CHECK-NEXT: br label %[[OUTER_LOOP:.*]] ; CHECK: [[PH_LOOPEXIT:.*]]: ; CHECK-NEXT: br label %[[PH:.*]] ; CHECK: [[PH]]: -; CHECK-NEXT: [[INDVARS_IV_NEXT3:%.*]] = add i32 [[INDVARS_IV2:%.*]], -1 ; CHECK-NEXT: br label %[[OUTER_LOOP]] ; CHECK: [[OUTER_LOOP]]: ; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT2:%.*]], %[[PH]] ], [ [[TMP0]], %[[ENTRY]] ] -; CHECK-NEXT: [[INDVARS_IV2]] = phi i32 [ 
[[INDVARS_IV_NEXT3]], %[[PH]] ], [ [[TMP1]], %[[ENTRY]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT2]] = add nsw i64 [[INDVARS_IV1]], -1 ; CHECK-NEXT: [[GEP_OUTER:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[INDVARS_IV_NEXT2]] ; CHECK-NEXT: store i8 0, ptr [[GEP_OUTER]], align 1 ; CHECK-NEXT: [[EXIT_COND_OUTER:%.*]] = icmp samesign ugt i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: br i1 [[EXIT_COND_OUTER]], label %[[INNER_LOOP_PREHEADER:.*]], label %[[PH]] ; CHECK: [[INNER_LOOP_PREHEADER]]: -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[INDVARS_IV2]] to i64 ; CHECK-NEXT: br label %[[INNER_LOOP:.*]] ; CHECK: [[INNER_LOOP]]: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[INNER_LOOP_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[INNER_LOOP]] ] ; CHECK-NEXT: [[GEP_INNER:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store i8 0, ptr [[GEP_INNER]], align 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[INNER_LOOP]], label %[[PH_LOOPEXIT]] +; CHECK-NEXT: [[EXIT_COND_INNER:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], [[INDVARS_IV_NEXT2]] +; CHECK-NEXT: br i1 [[EXIT_COND_INNER]], label %[[INNER_LOOP]], label %[[PH_LOOPEXIT]] ; CHECK: [[EXIT:.*:]] ; CHECK-NEXT: ret i32 0 ; From 71adb054024a1e9bd5ed4566beda74dea65362cd Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 10 Feb 2025 09:00:31 -0500 Subject: [PATCH 151/293] [clang] Expose -f(no-)strict-overflow as a clang-cl option (#126512) Also move the -fno-strict-overflow option definition next to the -fstrict-overflow one while here. Also add test coverage for f(no-)wrapv-pointer being a clang-cl option. 
--- clang/include/clang/Driver/Options.td | 6 +++--- clang/test/Driver/cl-options.c | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index c9d192a20ff1f..1cf62ab466134 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3421,8 +3421,6 @@ def fno_strict_aliasing : Flag<["-"], "fno-strict-aliasing">, Group, def fstruct_path_tbaa : Flag<["-"], "fstruct-path-tbaa">, Group; def fno_struct_path_tbaa : Flag<["-"], "fno-struct-path-tbaa">, Group; def fno_strict_enums : Flag<["-"], "fno-strict-enums">, Group; -def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group, - Visibility<[ClangOption, FlangOption]>; defm init_global_zero : BoolOptionWithoutMarshalling<"f", "init-global-zero", PosFlag, @@ -3934,7 +3932,9 @@ defm strict_vtable_pointers : BoolFOption<"strict-vtable-pointers", " overwriting polymorphic C++ objects">, NegFlag>; def fstrict_overflow : Flag<["-"], "fstrict-overflow">, Group, - Visibility<[ClangOption, FlangOption]>; + Visibility<[ClangOption, CLOption, FlangOption]>; +def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group, + Visibility<[ClangOption, CLOption, FlangOption]>; def fpointer_tbaa : Flag<["-"], "fpointer-tbaa">, Group; def fdriver_only : Flag<["-"], "fdriver-only">, Flags<[NoXarchOption]>, Visibility<[ClangOption, CLOption, DXCOption]>, diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c index 29a0fcbc17ac6..9f9ca1bf1a8fd 100644 --- a/clang/test/Driver/cl-options.c +++ b/clang/test/Driver/cl-options.c @@ -738,9 +738,13 @@ // RUN: -fno-modules-search-all \ // RUN: -fimplicit-modules \ // RUN: -fno-implicit-modules \ +// RUN: -fstrict-overflow \ +// RUN: -fno-strict-overflow \ // RUN: -ftrivial-auto-var-init=zero \ // RUN: -fwrapv \ // RUN: -fno-wrapv \ +// RUN: -fwrapv-pointer \ +// RUN: -fno-wrapv-pointer \ // RUN: --version \ // RUN: -Werror /Zs -- %s 
2>&1 From 4d2a1bf563556d12cccc4cace1c2e225a3c002e4 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Mon, 10 Feb 2025 15:05:49 +0100 Subject: [PATCH 152/293] [clang] CTAD alias: Respect explicit deduction guides defined after the first use of the alias template. (#125478) Fixes #103016 This is the last missing piece for the C++20 CTAD alias feature. No release note being added in this PR yet, I will send out a follow-up patch to mark this feature done. (Since the release 20 branch is cut, I think we should target on clang21). --- clang/lib/Sema/SemaTemplateDeductionGuide.cpp | 37 +++++++++++++++---- clang/test/SemaCXX/cxx20-ctad-type-alias.cpp | 18 +++++++-- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index 0d079677eecc5..e5931f4684a57 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -740,6 +740,28 @@ bool hasDeclaredDeductionGuides(DeclarationName Name, DeclContext *DC) { return false; } +// Returns all source deduction guides associated with the declared +// deduction guides that have the specified deduction guide name. +llvm::DenseSet getSourceDeductionGuides(DeclarationName Name, + DeclContext *DC) { + assert(Name.getNameKind() == + DeclarationName::NameKind::CXXDeductionGuideName && + "name must be a deduction guide name"); + llvm::DenseSet Result; + for (auto *D : DC->lookup(Name)) { + if (const auto *FTD = dyn_cast(D)) + D = FTD->getTemplatedDecl(); + + if (const auto *GD = dyn_cast(D)) { + assert(GD->getSourceDeductionGuide() && + "deduction guide for alias template must have a source deduction " + "guide"); + Result.insert(GD->getSourceDeductionGuide()); + } + } + return Result; +} + // Build the associated constraints for the alias deduction guides. 
// C++ [over.match.class.deduct]p3.3: // The associated constraints ([temp.constr.decl]) are the conjunction of the @@ -1191,17 +1213,14 @@ void DeclareImplicitDeductionGuidesForTypeAlias( if (AliasTemplate->isInvalidDecl()) return; auto &Context = SemaRef.Context; - // FIXME: if there is an explicit deduction guide after the first use of the - // type alias usage, we will not cover this explicit deduction guide. fix this - // case. - if (hasDeclaredDeductionGuides( - Context.DeclarationNames.getCXXDeductionGuideName(AliasTemplate), - AliasTemplate->getDeclContext())) - return; auto [Template, AliasRhsTemplateArgs] = getRHSTemplateDeclAndArgs(SemaRef, AliasTemplate); if (!Template) return; + auto SourceDeductionGuides = getSourceDeductionGuides( + Context.DeclarationNames.getCXXDeductionGuideName(AliasTemplate), + AliasTemplate->getDeclContext()); + DeclarationNameInfo NameInfo( Context.DeclarationNames.getCXXDeductionGuideName(Template), Loc); LookupResult Guides(SemaRef, NameInfo, clang::Sema::LookupOrdinaryName); @@ -1210,6 +1229,8 @@ void DeclareImplicitDeductionGuidesForTypeAlias( for (auto *G : Guides) { if (auto *DG = dyn_cast(G)) { + if (SourceDeductionGuides.contains(DG)) + continue; // The deduction guide is a non-template function decl, we just clone it. 
auto *FunctionType = SemaRef.Context.getTrivialTypeSourceInfo(DG->getType()); @@ -1252,7 +1273,7 @@ void DeclareImplicitDeductionGuidesForTypeAlias( continue; } FunctionTemplateDecl *F = dyn_cast(G); - if (!F) + if (!F || SourceDeductionGuides.contains(F->getTemplatedDecl())) continue; // The **aggregate** deduction guides are handled in a different code path // (DeclareAggregateDeductionGuideFromInitList), which involves the tricky diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp index 2d43e46b9e3d7..37dca2215af6b 100644 --- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp +++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp @@ -234,11 +234,23 @@ int i = 0; AFoo s{i}; static_assert(__is_same(decltype(s.t), int)); +template +using BFoo = AFoo; + +// template explicit deduction guide. +template +Foo(T) -> Foo; +static_assert(__is_same(decltype(AFoo(i).t), float)); +static_assert(__is_same(decltype(BFoo(i).t), float)); + // explicit deduction guide. Foo(int) -> Foo; -AFoo s2{i}; -// FIXME: the type should be X because of the above explicit deduction guide. 
-static_assert(__is_same(decltype(s2.t), int)); +static_assert(__is_same(decltype(AFoo(i).t), X)); +static_assert(__is_same(decltype(BFoo(i).t), X)); + +Foo(double) -> Foo; +static_assert(__is_same(decltype(AFoo(1.0).t), int)); +static_assert(__is_same(decltype(BFoo(1.0).t), int)); } // namespace test16 namespace test17 { From 1aa48af1f86009365524d43966bb40ea246fea47 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 10 Feb 2025 15:11:01 +0100 Subject: [PATCH 153/293] [clang][bytecode][NFC] Discard all CastExprs uniformly (#126511) --- clang/lib/AST/ByteCode/Compiler.cpp | 45 +++-------------------------- 1 file changed, 4 insertions(+), 41 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 86a3773d74d05..cf39209819ade 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -194,12 +194,12 @@ template class StmtExprScope final { template bool Compiler::VisitCastExpr(const CastExpr *CE) { const Expr *SubExpr = CE->getSubExpr(); - switch (CE->getCastKind()) { - case CK_LValueToRValue: { - if (DiscardResult) - return this->discard(SubExpr); + if (DiscardResult) + return this->delegate(SubExpr); + switch (CE->getCastKind()) { + case CK_LValueToRValue: { std::optional SubExprT = classify(SubExpr->getType()); // Prepare storage for the result. 
if (!Initializing && !SubExprT) { @@ -253,9 +253,6 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { case CK_UncheckedDerivedToBase: case CK_DerivedToBase: { - if (DiscardResult) - return this->discard(SubExpr); - if (!this->delegate(SubExpr)) return false; @@ -285,9 +282,6 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { } case CK_BaseToDerived: { - if (DiscardResult) - return this->discard(SubExpr); - if (!this->delegate(SubExpr)) return false; @@ -302,8 +296,6 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { if (!SubExpr->getType()->isFloatingType() || !CE->getType()->isFloatingType()) return false; - if (DiscardResult) - return this->discard(SubExpr); if (!this->visit(SubExpr)) return false; const auto *TargetSemantics = &Ctx.getFloatSemantics(CE->getType()); @@ -311,8 +303,6 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { } case CK_IntegralToFloating: { - if (DiscardResult) - return this->discard(SubExpr); std::optional FromT = classify(SubExpr->getType()); if (!FromT) return false; @@ -327,8 +317,6 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { case CK_FloatingToBoolean: case CK_FloatingToIntegral: { - if (DiscardResult) - return this->discard(SubExpr); std::optional ToT = classify(CE->getType()); @@ -352,9 +340,6 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { case CK_NullToMemberPointer: { if (!this->discard(SubExpr)) return false; - if (DiscardResult) - return true; - const Descriptor *Desc = nullptr; const QualType PointeeType = CE->getType()->getPointeeType(); if (!PointeeType.isNull()) { @@ -371,9 +356,6 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { } case CK_PointerToIntegral: { - if (DiscardResult) - return this->discard(SubExpr); - if (!this->visit(SubExpr)) return false; @@ -399,8 +381,6 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { return false; if (!this->emitArrayDecay(CE)) return false; - if (DiscardResult) - return this->emitPopPtr(CE); return true; } @@ -412,9 +392,6 @@ bool 
Compiler::VisitCastExpr(const CastExpr *CE) { // FIXME: I think the discard is wrong since the int->ptr cast might cause a // diagnostic. PrimType T = classifyPrim(IntType); - if (DiscardResult) - return this->emitPop(T, CE); - QualType PtrType = CE->getType(); const Descriptor *Desc; if (std::optional T = classify(PtrType->getPointeeType())) @@ -454,10 +431,6 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { return false; return this->emitInvalidCast(CastKind::Reinterpret, /*Fatal=*/true, CE); } - - if (DiscardResult) - return this->discard(SubExpr); - QualType SubExprTy = SubExpr->getType(); std::optional FromT = classify(SubExprTy); // Casts from integer/vector to vector. @@ -493,8 +466,6 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { case CK_FixedPointToBoolean: case CK_BooleanToSignedIntegral: case CK_IntegralCast: { - if (DiscardResult) - return this->discard(SubExpr); std::optional FromT = classify(SubExpr->getType()); std::optional ToT = classify(CE->getType()); @@ -546,8 +517,6 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { case CK_IntegralComplexToBoolean: case CK_FloatingComplexToBoolean: { - if (DiscardResult) - return this->discard(SubExpr); if (!this->visit(SubExpr)) return false; return this->emitComplexBoolCast(SubExpr); @@ -585,9 +554,6 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { case CK_FloatingComplexToIntegralComplex: { assert(CE->getType()->isAnyComplexType()); assert(SubExpr->getType()->isAnyComplexType()); - if (DiscardResult) - return this->discard(SubExpr); - if (!Initializing) { std::optional LocalIndex = allocateLocal(CE); if (!LocalIndex) @@ -633,9 +599,6 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { assert(classify(SubExpr->getType())); assert(CE->getType()->isVectorType()); - if (DiscardResult) - return this->discard(SubExpr); - if (!Initializing) { std::optional LocalIndex = allocateLocal(CE); if (!LocalIndex) From ec60e1d8e2c265f86f08590b6061eb6f51dc3349 Mon Sep 17 00:00:00 2001 From: zhijian lin 
Date: Mon, 10 Feb 2025 09:37:04 -0500 Subject: [PATCH 154/293] [XCOFF][llvm-readobj] Print symbol value kind when dumping symbols (#125861) llvm-readobj print out symbol value name for xcoff symbol table. reference doc: https://www.ibm.com/docs/en/aix/7.2?topic=formats-xcoff-object-file-format#XCOFF__yaa3i18fjbau --- .../tools/llvm-readobj/XCOFF/symbols.test | 166 ++++++++++++++++++ .../tools/llvm-readobj/XCOFF/symbols64.test | 166 ++++++++++++++++++ llvm/tools/llvm-readobj/XCOFFDumper.cpp | 10 +- 3 files changed, 339 insertions(+), 3 deletions(-) diff --git a/llvm/test/tools/llvm-readobj/XCOFF/symbols.test b/llvm/test/tools/llvm-readobj/XCOFF/symbols.test index 71347a85f1ba5..64e0feb966ee1 100644 --- a/llvm/test/tools/llvm-readobj/XCOFF/symbols.test +++ b/llvm/test/tools/llvm-readobj/XCOFF/symbols.test @@ -163,6 +163,73 @@ Symbols: StabInfoIndex: 44 StabSectNum: 55 + - Name: bstat + Value: 0x0 + Section: N_DEBUG + Type: 0x00 + StorageClass: C_BSTAT + + - Name: cfun + Value: 0x0 + Section: N_DEBUG + Type: 0x00 + StorageClass: C_FUN + + - Name: stsym + Value: 0x05 + Section: N_DEBUG + Type: 0x00 + StorageClass: C_STSYM + + - Name: bincl + Value: 0x06 + Section: .text + Type: 0x00 + StorageClass: C_BINCL + + - Name: eincl + Value: 0x06 + Section: .text + Type: 0x00 + StorageClass: C_EINCL + + - Name: lsym + Value: 0x07 + Section: N_DEBUG + Type: 0x00 + StorageClass: C_LSYM + + - Name: psym + Value: 0x07 + Section: N_DEBUG + Type: 0x00 + StorageClass: C_PSYM + + - Name: rsym + Value: 0x08 + Section: N_DEBUG + Type: 0x00 + StorageClass: C_RSYM + + - Name: rpsym + Value: 0x08 + Section: N_DEBUG + Type: 0x00 + StorageClass: C_RPSYM + + - Name: ecoml + Value: 0x09 + Section: .text + Type: 0x00 + StorageClass: C_ECOML + + - Name: cinfo + Value: 0x02 + Section: .text + Type: 0x00 + StorageClass: C_INFO + + # SYMBOL32: Symbols [ # SYMBOL32-NEXT: Symbol { # SYMBOL32-NEXT: Index: 0 @@ -368,4 +435,103 @@ Symbols: # SYMBOL32-NEXT: StabSectNum: 0x37 # SYMBOL32-NEXT: } # 
SYMBOL32-NEXT: } +# SYMBOL32-NEXT: Symbol { +# SYMBOL32-NEXT: Index: 25 +# SYMBOL32-NEXT: Name: Unimplemented Debug Name +# SYMBOL32-NEXT: Value (SymbolTableIndex): 0x0 +# SYMBOL32-NEXT: Section: N_DEBUG +# SYMBOL32-NEXT: Type: 0x0 +# SYMBOL32-NEXT: StorageClass: C_BSTAT (0x8F) +# SYMBOL32-NEXT: NumberOfAuxEntries: 0 +# SYMBOL32-NEXT: } +# SYMBOL32-NEXT: Symbol { +# SYMBOL32-NEXT: Index: 26 +# SYMBOL32-NEXT: Name: Unimplemented Debug Name +# SYMBOL32-NEXT: Value (OffsetInCSect): 0x0 +# SYMBOL32-NEXT: Section: N_DEBUG +# SYMBOL32-NEXT: Type: 0x0 +# SYMBOL32-NEXT: StorageClass: C_FUN (0x8E) +# SYMBOL32-NEXT: NumberOfAuxEntries: 0 +# SYMBOL32-NEXT: } +# SYMBOL32-NEXT: Symbol { +# SYMBOL32-NEXT: Index: 27 +# SYMBOL32-NEXT: Name: Unimplemented Debug Name +# SYMBOL32-NEXT: Value (OffsetInCSect): 0x5 +# SYMBOL32-NEXT: Section: N_DEBUG +# SYMBOL32-NEXT: Type: 0x0 +# SYMBOL32-NEXT: StorageClass: C_STSYM (0x85) +# SYMBOL32-NEXT: NumberOfAuxEntries: 0 +# SYMBOL32-NEXT: } +# SYMBOL32-NEXT: Symbol { +# SYMBOL32-NEXT: Index: 28 +# SYMBOL32-NEXT: Name: bincl +# SYMBOL32-NEXT: Value (OffsetInFile): 0x6 +# SYMBOL32-NEXT: Section: .text +# SYMBOL32-NEXT: Type: 0x0 +# SYMBOL32-NEXT: StorageClass: C_BINCL (0x6C) +# SYMBOL32-NEXT: NumberOfAuxEntries: 0 +# SYMBOL32-NEXT: } +# SYMBOL32-NEXT: Symbol { +# SYMBOL32-NEXT: Index: 29 +# SYMBOL32-NEXT: Name: eincl +# SYMBOL32-NEXT: Value (OffsetInFile): 0x6 +# SYMBOL32-NEXT: Section: .text +# SYMBOL32-NEXT: Type: 0x0 +# SYMBOL32-NEXT: StorageClass: C_EINCL (0x6D) +# SYMBOL32-NEXT: NumberOfAuxEntries: 0 +# SYMBOL32-NEXT: } +# SYMBOL32-NEXT: Symbol { +# SYMBOL32-NEXT: Index: 30 +# SYMBOL32-NEXT: Name: Unimplemented Debug Name +# SYMBOL32-NEXT: Value (OffsetRelToStackFrame): 0x7 +# SYMBOL32-NEXT: Section: N_DEBUG +# SYMBOL32-NEXT: Type: 0x0 +# SYMBOL32-NEXT: StorageClass: C_LSYM (0x81) +# SYMBOL32-NEXT: NumberOfAuxEntries: 0 +# SYMBOL32-NEXT: } +# SYMBOL32-NEXT: Symbol { +# SYMBOL32-NEXT: Index: 31 +# SYMBOL32-NEXT: Name: Unimplemented Debug Name 
+# SYMBOL32-NEXT: Value (OffsetRelToStackFrame): 0x7 +# SYMBOL32-NEXT: Section: N_DEBUG +# SYMBOL32-NEXT: Type: 0x0 +# SYMBOL32-NEXT: StorageClass: C_PSYM (0x82) +# SYMBOL32-NEXT: NumberOfAuxEntries: 0 +# SYMBOL32-NEXT: } +# SYMBOL32-NEXT: Symbol { +# SYMBOL32-NEXT: Index: 32 +# SYMBOL32-NEXT: Name: Unimplemented Debug Name +# SYMBOL32-NEXT: Value (RegisterNumber): 0x8 +# SYMBOL32-NEXT: Section: N_DEBUG +# SYMBOL32-NEXT: Type: 0x0 +# SYMBOL32-NEXT: StorageClass: C_RSYM (0x83) +# SYMBOL32-NEXT: NumberOfAuxEntries: 0 +# SYMBOL32-NEXT: } +# SYMBOL32-NEXT: Symbol { +# SYMBOL32-NEXT: Index: 33 +# SYMBOL32-NEXT: Name: Unimplemented Debug Name +# SYMBOL32-NEXT: Value (RegisterNumber): 0x8 +# SYMBOL32-NEXT: Section: N_DEBUG +# SYMBOL32-NEXT: Type: 0x0 +# SYMBOL32-NEXT: StorageClass: C_RPSYM (0x84) +# SYMBOL32-NEXT: NumberOfAuxEntries: 0 +# SYMBOL32-NEXT: } +# SYMBOL32-NEXT: Symbol { +# SYMBOL32-NEXT: Index: 34 +# SYMBOL32-NEXT: Name: Unimplemented Debug Name +# SYMBOL32-NEXT: Value (OffsetInCommBlock): 0x9 +# SYMBOL32-NEXT: Section: .text +# SYMBOL32-NEXT: Type: 0x0 +# SYMBOL32-NEXT: StorageClass: C_ECOML (0x88) +# SYMBOL32-NEXT: NumberOfAuxEntries: 0 +# SYMBOL32-NEXT: } +# SYMBOL32-NEXT: Symbol { +# SYMBOL32-NEXT: Index: 35 +# SYMBOL32-NEXT: Name: cinfo +# SYMBOL32-NEXT: Value (OffsetInCommentSection): 0x2 +# SYMBOL32-NEXT: Section: .text +# SYMBOL32-NEXT: Type: 0x0 +# SYMBOL32-NEXT: StorageClass: C_INFO (0x6E) +# SYMBOL32-NEXT: NumberOfAuxEntries: 0 +# SYMBOL32-NEXT: } # SYMBOL32-NEXT: ] diff --git a/llvm/test/tools/llvm-readobj/XCOFF/symbols64.test b/llvm/test/tools/llvm-readobj/XCOFF/symbols64.test index 2b9edb3829af8..462ac5f4a3161 100644 --- a/llvm/test/tools/llvm-readobj/XCOFF/symbols64.test +++ b/llvm/test/tools/llvm-readobj/XCOFF/symbols64.test @@ -138,6 +138,73 @@ Symbols: - Type: AUX_SYM LineNum: 3 + - Name: bstat + Value: 0x0 + Section: N_DEBUG + Type: 0x00 + StorageClass: C_BSTAT + + - Name: cfun + Value: 0x0 + Section: N_DEBUG + Type: 0x00 + StorageClass: 
C_FUN + + - Name: stsym + Value: 0x05 + Section: N_DEBUG + Type: 0x00 + StorageClass: C_STSYM + + - Name: bincl + Value: 0x06 + Section: .text + Type: 0x00 + StorageClass: C_BINCL + + - Name: eincl + Value: 0x06 + Section: .text + Type: 0x00 + StorageClass: C_EINCL + + - Name: lsym + Value: 0x07 + Section: N_DEBUG + Type: 0x00 + StorageClass: C_LSYM + + - Name: psym + Value: 0x07 + Section: N_DEBUG + Type: 0x00 + StorageClass: C_PSYM + + - Name: rsym + Value: 0x08 + Section: N_DEBUG + Type: 0x00 + StorageClass: C_RSYM + + - Name: rpsym + Value: 0x08 + Section: N_DEBUG + Type: 0x00 + StorageClass: C_RPSYM + + - Name: ecoml + Value: 0x09 + Section: .text + Type: 0x00 + StorageClass: C_ECOML + + - Name: cinfo + Value: 0x02 + Section: .text + Type: 0x00 + StorageClass: C_INFO + + # SYMBOL64: Symbols [ # SYMBOL64-NEXT: Symbol { # SYMBOL64-NEXT: Index: 0 @@ -326,4 +393,103 @@ Symbols: # SYMBOL64-NEXT: Auxiliary Type: AUX_SYM (0xFD) # SYMBOL64-NEXT: } # SYMBOL64-NEXT: } +# SYMBOL64-NEXT: Symbol { +# SYMBOL64-NEXT: Index: 23 +# SYMBOL64-NEXT: Name: Unimplemented Debug Name +# SYMBOL64-NEXT: Value (SymbolTableIndex): 0x0 +# SYMBOL64-NEXT: Section: N_DEBUG +# SYMBOL64-NEXT: Type: 0x0 +# SYMBOL64-NEXT: StorageClass: C_BSTAT (0x8F) +# SYMBOL64-NEXT: NumberOfAuxEntries: 0 +# SYMBOL64-NEXT: } +# SYMBOL64-NEXT: Symbol { +# SYMBOL64-NEXT: Index: 24 +# SYMBOL64-NEXT: Name: Unimplemented Debug Name +# SYMBOL64-NEXT: Value (OffsetInCSect): 0x0 +# SYMBOL64-NEXT: Section: N_DEBUG +# SYMBOL64-NEXT: Type: 0x0 +# SYMBOL64-NEXT: StorageClass: C_FUN (0x8E) +# SYMBOL64-NEXT: NumberOfAuxEntries: 0 +# SYMBOL64-NEXT: } +# SYMBOL64-NEXT: Symbol { +# SYMBOL64-NEXT: Index: 25 +# SYMBOL64-NEXT: Name: Unimplemented Debug Name +# SYMBOL64-NEXT: Value (OffsetInCSect): 0x5 +# SYMBOL64-NEXT: Section: N_DEBUG +# SYMBOL64-NEXT: Type: 0x0 +# SYMBOL64-NEXT: StorageClass: C_STSYM (0x85) +# SYMBOL64-NEXT: NumberOfAuxEntries: 0 +# SYMBOL64-NEXT: } +# SYMBOL64-NEXT: Symbol { +# SYMBOL64-NEXT: Index: 26 +# 
SYMBOL64-NEXT: Name: bincl +# SYMBOL64-NEXT: Value (OffsetInFile): 0x6 +# SYMBOL64-NEXT: Section: .text +# SYMBOL64-NEXT: Type: 0x0 +# SYMBOL64-NEXT: StorageClass: C_BINCL (0x6C) +# SYMBOL64-NEXT: NumberOfAuxEntries: 0 +# SYMBOL64-NEXT: } +# SYMBOL64-NEXT: Symbol { +# SYMBOL64-NEXT: Index: 27 +# SYMBOL64-NEXT: Name: eincl +# SYMBOL64-NEXT: Value (OffsetInFile): 0x6 +# SYMBOL64-NEXT: Section: .text +# SYMBOL64-NEXT: Type: 0x0 +# SYMBOL64-NEXT: StorageClass: C_EINCL (0x6D) +# SYMBOL64-NEXT: NumberOfAuxEntries: 0 +# SYMBOL64-NEXT: } +# SYMBOL64-NEXT: Symbol { +# SYMBOL64-NEXT: Index: 28 +# SYMBOL64-NEXT: Name: Unimplemented Debug Name +# SYMBOL64-NEXT: Value (OffsetRelToStackFrame): 0x7 +# SYMBOL64-NEXT: Section: N_DEBUG +# SYMBOL64-NEXT: Type: 0x0 +# SYMBOL64-NEXT: StorageClass: C_LSYM (0x81) +# SYMBOL64-NEXT: NumberOfAuxEntries: 0 +# SYMBOL64-NEXT: } +# SYMBOL64-NEXT: Symbol { +# SYMBOL64-NEXT: Index: 29 +# SYMBOL64-NEXT: Name: Unimplemented Debug Name +# SYMBOL64-NEXT: Value (OffsetRelToStackFrame): 0x7 +# SYMBOL64-NEXT: Section: N_DEBUG +# SYMBOL64-NEXT: Type: 0x0 +# SYMBOL64-NEXT: StorageClass: C_PSYM (0x82) +# SYMBOL64-NEXT: NumberOfAuxEntries: 0 +# SYMBOL64-NEXT: } +# SYMBOL64-NEXT: Symbol { +# SYMBOL64-NEXT: Index: 30 +# SYMBOL64-NEXT: Name: Unimplemented Debug Name +# SYMBOL64-NEXT: Value (RegisterNumber): 0x8 +# SYMBOL64-NEXT: Section: N_DEBUG +# SYMBOL64-NEXT: Type: 0x0 +# SYMBOL64-NEXT: StorageClass: C_RSYM (0x83) +# SYMBOL64-NEXT: NumberOfAuxEntries: 0 +# SYMBOL64-NEXT: } +# SYMBOL64-NEXT: Symbol { +# SYMBOL64-NEXT: Index: 31 +# SYMBOL64-NEXT: Name: Unimplemented Debug Name +# SYMBOL64-NEXT: Value (RegisterNumber): 0x8 +# SYMBOL64-NEXT: Section: N_DEBUG +# SYMBOL64-NEXT: Type: 0x0 +# SYMBOL64-NEXT: StorageClass: C_RPSYM (0x84) +# SYMBOL64-NEXT: NumberOfAuxEntries: 0 +# SYMBOL64-NEXT: } +# SYMBOL64-NEXT: Symbol { +# SYMBOL64-NEXT: Index: 32 +# SYMBOL64-NEXT: Name: Unimplemented Debug Name +# SYMBOL64-NEXT: Value (OffsetInCommBlock): 0x9 +# SYMBOL64-NEXT: 
Section: .text +# SYMBOL64-NEXT: Type: 0x0 +# SYMBOL64-NEXT: StorageClass: C_ECOML (0x88) +# SYMBOL64-NEXT: NumberOfAuxEntries: 0 +# SYMBOL64-NEXT: } +# SYMBOL64-NEXT: Symbol { +# SYMBOL64-NEXT: Index: 33 +# SYMBOL64-NEXT: Name: cinfo +# SYMBOL64-NEXT: Value (OffsetInCommentSection): 0x2 +# SYMBOL64-NEXT: Section: .text +# SYMBOL64-NEXT: Type: 0x0 +# SYMBOL64-NEXT: StorageClass: C_INFO (0x6E) +# SYMBOL64-NEXT: NumberOfAuxEntries: 0 +# SYMBOL64-NEXT: } # SYMBOL64-NEXT: ] diff --git a/llvm/tools/llvm-readobj/XCOFFDumper.cpp b/llvm/tools/llvm-readobj/XCOFFDumper.cpp index 6a099c08e1aca..03764e9ba483d 100644 --- a/llvm/tools/llvm-readobj/XCOFFDumper.cpp +++ b/llvm/tools/llvm-readobj/XCOFFDumper.cpp @@ -692,22 +692,26 @@ static StringRef GetSymbolValueName(XCOFF::StorageClass SC) { case XCOFF::C_BLOCK: return "Value (RelocatableAddress)"; case XCOFF::C_FILE: + case XCOFF::C_BSTAT: return "Value (SymbolTableIndex)"; case XCOFF::C_DWARF: return "Value (OffsetInDWARF)"; case XCOFF::C_FUN: case XCOFF::C_STSYM: + return "Value (OffsetInCSect)"; case XCOFF::C_BINCL: case XCOFF::C_EINCL: + return "Value (OffsetInFile)"; case XCOFF::C_INFO: - case XCOFF::C_BSTAT: + return "Value (OffsetInCommentSection)"; case XCOFF::C_LSYM: case XCOFF::C_PSYM: + return "Value (OffsetRelToStackFrame)"; case XCOFF::C_RPSYM: case XCOFF::C_RSYM: + return "Value (RegisterNumber)"; case XCOFF::C_ECOML: - assert(false && "This StorageClass for the symbol is not yet implemented."); - return ""; + return "Value (OffsetInCommBlock)"; default: return "Value"; } From 3019e49ebfc5d710191712b6d437c56c01e65b87 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 10 Feb 2025 14:47:13 +0000 Subject: [PATCH 155/293] SCEV: thread samesign in isBasicBlockEntryGuardedByCond (NFC) (#125840) isBasicBlockEntryGuardedByCond inadvertently drops samesign information when calling ICmpInst::getNonStrictPredicate. Fix this.
--- llvm/include/llvm/IR/Instructions.h | 12 ++++++++++++ llvm/lib/Analysis/ScalarEvolution.cpp | 5 +++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 9a41971b63373..a1f964352207f 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -1231,6 +1231,18 @@ class ICmpInst: public CmpInst { return getSwappedCmpPredicate(getCmpPredicate()); } + /// @returns the non-strict predicate along with samesign information: static + /// variant. + static CmpPredicate getNonStrictCmpPredicate(CmpPredicate Pred) { + return {getNonStrictPredicate(Pred), Pred.hasSameSign()}; + } + + /// For example, SGT -> SGE, SLT -> SLE, ULT -> ULE, UGT -> UGE. + /// @returns the non-strict predicate along with samesign information. + Predicate getNonStrictCmpPredicate() const { + return getNonStrictCmpPredicate(getCmpPredicate()); + } + /// For example, EQ->EQ, SLE->SLE, UGT->SGT, etc. /// @returns the predicate that would be the result if the operand were /// regarded as signed. diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 573b052aa4b2c..8f74c1c398ced 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -11632,8 +11632,9 @@ bool ScalarEvolution::isBasicBlockEntryGuardedByCond(const BasicBlock *BB, // non-strict comparison is known from ranges and non-equality is known from // dominating predicates. If we are proving strict comparison, we always try // to prove non-equality and non-strict comparison separately. 
- auto NonStrictPredicate = ICmpInst::getNonStrictPredicate(Pred); - const bool ProvingStrictComparison = (Pred != NonStrictPredicate); + CmpPredicate NonStrictPredicate = ICmpInst::getNonStrictCmpPredicate(Pred); + const bool ProvingStrictComparison = + (Pred != static_cast(NonStrictPredicate)); bool ProvedNonStrictComparison = false; bool ProvedNonEquality = false; From 36530414e3fc49ce9c5a74acf3a68731965ea4d6 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 10 Feb 2025 23:43:16 +0800 Subject: [PATCH 156/293] [RISCV][VLOPT] Add support for Vector Fixed-Point Arithmetic Instructions (#126483) This patch adds the remaining support for fixed-point arithmetic instructions (we previously had support for averaging adds and subtracts). For saturating adds/subs/multiplies/clips, we can't change `vl` if `vxsat` is used, since changing `vl` may change its value. So this patch checks to see if it's dead before considering it a candidate. --- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 38 +- .../RISCV/rvv/rvv-peephole-vmerge-vops.ll | 3 +- llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll | 508 ++++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vl-opt.mir | 20 + 4 files changed, 567 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 28f89df9554b5..1ba7f0b522a2b 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -979,6 +979,17 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VMV_V_I: case RISCV::VMV_V_X: case RISCV::VMV_V_V: + // Vector Single-Width Saturating Add and Subtract + case RISCV::VSADDU_VV: + case RISCV::VSADDU_VX: + case RISCV::VSADDU_VI: + case RISCV::VSADD_VV: + case RISCV::VSADD_VX: + case RISCV::VSADD_VI: + case RISCV::VSSUBU_VV: + case RISCV::VSSUBU_VX: + case RISCV::VSSUB_VV: + case RISCV::VSSUB_VX: // Vector Single-Width Averaging Add and Subtract case RISCV::VAADDU_VV: case RISCV::VAADDU_VX: @@ -988,6 
+999,23 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VASUBU_VX: case RISCV::VASUB_VV: case RISCV::VASUB_VX: + // Vector Single-Width Fractional Multiply with Rounding and Saturation + case RISCV::VSMUL_VV: + case RISCV::VSMUL_VX: + // Vector Single-Width Scaling Shift Instructions + case RISCV::VSSRL_VV: + case RISCV::VSSRL_VX: + case RISCV::VSSRL_VI: + case RISCV::VSSRA_VV: + case RISCV::VSSRA_VX: + case RISCV::VSSRA_VI: + // Vector Narrowing Fixed-Point Clip Instructions + case RISCV::VNCLIPU_WV: + case RISCV::VNCLIPU_WX: + case RISCV::VNCLIPU_WI: + case RISCV::VNCLIP_WV: + case RISCV::VNCLIP_WX: + case RISCV::VNCLIP_WI: // Vector Crypto case RISCV::VWSLL_VI: @@ -1187,8 +1215,16 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const { const MCInstrDesc &Desc = MI.getDesc(); if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags)) return false; - if (MI.getNumDefs() != 1) + + if (MI.getNumExplicitDefs() != 1) + return false; + + // Some instructions have implicit defs e.g. $vxsat. If they might be read + // later then we can't reduce VL. 
+ if (!MI.allImplicitDefsAreDead()) { + LLVM_DEBUG(dbgs() << "Not a candidate because has non-dead implicit def\n"); return false; + } if (MI.mayRaiseFPException()) { LLVM_DEBUG(dbgs() << "Not a candidate because may raise FP exception\n"); diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll index d329979857a6b..403cc0eb9dce1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll @@ -894,9 +894,10 @@ define void @test_dag_loop() { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-NEXT: vmclr.m v0 +; CHECK-NEXT: vsetivli zero, 0, e8, m4, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmv.v.i v12, 0 -; CHECK-NEXT: vsetivli zero, 0, e8, m4, tu, mu +; CHECK-NEXT: vsetvli zero, zero, e8, m4, tu, mu ; CHECK-NEXT: vssubu.vx v12, v8, zero, v0.t ; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, ma ; CHECK-NEXT: vmseq.vv v0, v12, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll index f4591a191c8b7..c6ee9e34dc207 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll @@ -2511,6 +2511,514 @@ define @vwmaccus_vx( %a, i16 %b, %2 } +define @vsaddu_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vsaddu_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vsaddu.vv v10, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vsaddu_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vsaddu.vv v10, v8, v10 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vsaddu( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vadd( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vsaddu_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: 
vsaddu_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vsaddu.vx v10, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vsaddu_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vsaddu.vx v10, v8, a0 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vsaddu( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vadd( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vsaddu_vi( %a, iXLen %vl) { +; NOVLOPT-LABEL: vsaddu_vi: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vsaddu.vi v10, v8, 5 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vsaddu_vi: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vsaddu.vi v10, v8, 5 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vsaddu( poison, %a, i32 5, iXLen -1) + %2 = call @llvm.riscv.vadd( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vsadd_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vsadd_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vsadd.vv v10, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vsadd_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vsadd.vv v10, v8, v10 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vsadd( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vadd( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vsadd_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vsadd_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vsadd.vx v10, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; 
NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vsadd_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vsadd.vx v10, v8, a0 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vsadd( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vadd( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vsadd_vi( %a, iXLen %vl) { +; NOVLOPT-LABEL: vsadd_vi: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vsadd.vi v10, v8, 5 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vsadd_vi: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vsadd.vi v10, v8, 5 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vsadd( poison, %a, i32 5, iXLen -1) + %2 = call @llvm.riscv.vadd( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vssubu_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vssubu_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vssubu.vv v10, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vssubu_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vssubu.vv v10, v8, v10 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vssubu( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vadd( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vssubu_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vssubu_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vssubu.vx v10, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vssubu_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vssubu.vx 
v10, v8, a0 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vssubu( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vadd( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vssub_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vssub_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vssub.vv v10, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vssub_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vssub.vv v10, v8, v10 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vssub( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vadd( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vssub_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vssub_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vssub.vx v10, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vssub_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vssub.vx v10, v8, a0 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vssub( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vadd( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vsmul_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vsmul_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: csrwi vxrm, 0 +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vsmul.vv v8, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vsmul_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: csrwi vxrm, 0 +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vsmul.vv v8, v8, v10 +; VLOPT-NEXT: vadd.vv v8, v8, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vsmul.nxv4i32.nxv4i32( 
poison, %a, %b, iXLen 0, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vsmul_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vsmul_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: csrwi vxrm, 0 +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vsmul.vx v10, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vsmul_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: csrwi vxrm, 0 +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vsmul.vx v10, v8, a0 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vsmul.nxv4i32.nxv4i32( poison, %a, i32 %b, iXLen 0, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vssrl_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vssrl_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: csrwi vxrm, 0 +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vssrl.vv v8, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vssrl_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: csrwi vxrm, 0 +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vssrl.vv v8, v8, v10 +; VLOPT-NEXT: vadd.vv v8, v8, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vssrl.nxv4i32.nxv4i32( poison, %a, %b, iXLen 0, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vssrl_vx( %a, iXLen %b, iXLen %vl) { +; NOVLOPT-LABEL: vssrl_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: csrwi vxrm, 0 +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vssrl.vx v10, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vssrl_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: csrwi vxrm, 0 +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; 
VLOPT-NEXT: vssrl.vx v10, v8, a0 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vssrl.nxv4i32.nxv4i32( poison, %a, iXLen %b, iXLen 0, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vssrl_vi( %a, iXLen %vl) { +; NOVLOPT-LABEL: vssrl_vi: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: csrwi vxrm, 0 +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vssrl.vi v10, v8, 5 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vssrl_vi: +; VLOPT: # %bb.0: +; VLOPT-NEXT: csrwi vxrm, 0 +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vssrl.vi v10, v8, 5 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vssrl.nxv4i32.nxv4i32( poison, %a, iXLen 5, iXLen 0, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vssra_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vssra_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: csrwi vxrm, 0 +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vssra.vv v8, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vssra_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: csrwi vxrm, 0 +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vssra.vv v8, v8, v10 +; VLOPT-NEXT: vadd.vv v8, v8, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vssra.nxv4i32.nxv4i32( poison, %a, %b, iXLen 0, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vssra_vx( %a, iXLen %b, iXLen %vl) { +; NOVLOPT-LABEL: vssra_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: csrwi vxrm, 0 +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vssra.vx v10, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; 
VLOPT-LABEL: vssra_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: csrwi vxrm, 0 +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vssra.vx v10, v8, a0 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vssra.nxv4i32.nxv4i32( poison, %a, iXLen %b, iXLen 0, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vssra_vi( %a, iXLen %vl) { +; NOVLOPT-LABEL: vssra_vi: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: csrwi vxrm, 0 +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vssra.vi v10, v8, 5 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vssra_vi: +; VLOPT: # %bb.0: +; VLOPT-NEXT: csrwi vxrm, 0 +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vssra.vi v10, v8, 5 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vssra.nxv4i32.nxv4i32( poison, %a, iXLen 5, iXLen 0, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vnclipu_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vnclipu_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: csrwi vxrm, 0 +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vnclipu.wv v14, v8, v12 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v14, v14 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vnclipu_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: csrwi vxrm, 0 +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vnclipu.wv v14, v8, v12 +; VLOPT-NEXT: vadd.vv v8, v14, v14 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vnclipu( poison, %a, %b, iXLen 0, iXLen -1) + %2 = call @llvm.riscv.vadd( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vnclipu_vx( %a, iXLen %b, iXLen %vl) { +; NOVLOPT-LABEL: vnclipu_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: csrwi vxrm, 0 +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vnclipu.wx v12, v8, a0 +; 
NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vnclipu_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: csrwi vxrm, 0 +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vnclipu.wx v12, v8, a0 +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vnclipu( poison, %a, iXLen %b, iXLen 0, iXLen -1) + %2 = call @llvm.riscv.vadd( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vnclipu_vi( %a, iXLen %vl) { +; NOVLOPT-LABEL: vnclipu_vi: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: csrwi vxrm, 0 +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vnclipu.wi v12, v8, 5 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vnclipu_vi: +; VLOPT: # %bb.0: +; VLOPT-NEXT: csrwi vxrm, 0 +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vnclipu.wi v12, v8, 5 +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vnclipu( poison, %a, iXLen 5, iXLen 0, iXLen -1) + %2 = call @llvm.riscv.vadd( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vnclip_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vnclip_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: csrwi vxrm, 0 +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vnclip.wv v14, v8, v12 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v14, v14 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vnclip_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: csrwi vxrm, 0 +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vnclip.wv v14, v8, v12 +; VLOPT-NEXT: vadd.vv v8, v14, v14 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vnclip( poison, %a, %b, iXLen 0, iXLen -1) + %2 = call @llvm.riscv.vadd( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vnclip_vx( %a, iXLen %b, iXLen %vl) { +; NOVLOPT-LABEL: vnclip_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: csrwi vxrm, 0 +; NOVLOPT-NEXT: vsetvli a2, 
zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vnclip.wx v12, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vnclip_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: csrwi vxrm, 0 +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vnclip.wx v12, v8, a0 +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vnclip( poison, %a, iXLen %b, iXLen 0, iXLen -1) + %2 = call @llvm.riscv.vadd( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vnclip_vi( %a, iXLen %vl) { +; NOVLOPT-LABEL: vnclip_vi: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: csrwi vxrm, 0 +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vnclip.wi v12, v8, 5 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vnclip_vi: +; VLOPT: # %bb.0: +; VLOPT-NEXT: csrwi vxrm, 0 +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vnclip.wi v12, v8, 5 +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vnclip( poison, %a, iXLen 5, iXLen 0, iXLen -1) + %2 = call @llvm.riscv.vadd( poison, %1, %1, iXLen %vl) + ret %2 +} + define @vmv_v_i( %a, i32 %x, iXLen %vl) { ; NOVLOPT-LABEL: vmv_v_i: ; NOVLOPT: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir index 78054c73d848f..0475a988e9851 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir @@ -270,3 +270,23 @@ body: | %a:vr = PseudoVADD_VV_M1 %z, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ %b:vr = PseudoVADD_VV_M1 $noreg, %a, $noreg, 2, 3 /* e8 */, 0 /* tu, mu */ ... 
+--- +name: vxsat_dead +body: | + bb.0: + ; CHECK-LABEL: name: vxsat_dead + ; CHECK: %x:vr = PseudoVSADDU_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */, implicit-def dead $vxsat + ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + %x:vr = PseudoVSADDU_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */, implicit-def dead $vxsat + %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ +... +--- +name: vxsat_not_dead +body: | + bb.0: + ; CHECK-LABEL: name: vxsat_not_dead + ; CHECK: %x:vr = PseudoVSADDU_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */, implicit-def $vxsat + ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ + %x:vr = PseudoVSADDU_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */, implicit-def $vxsat + %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */ +... From 280d2a3035ad362cb9dab9f59aa9bdbb88723e9e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 10 Feb 2025 07:48:57 -0800 Subject: [PATCH 157/293] [AST] Avoid repeated hash lookups (NFC) (#126461) --- clang/lib/AST/RawCommentList.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/AST/RawCommentList.cpp b/clang/lib/AST/RawCommentList.cpp index fddedd3a31856..9658c6ab3d39d 100644 --- a/clang/lib/AST/RawCommentList.cpp +++ b/clang/lib/AST/RawCommentList.cpp @@ -287,13 +287,13 @@ void RawCommentList::addComment(const RawComment &RC, // If this is the first Doxygen comment, save it (because there isn't // anything to merge it with). 
- if (OrderedComments[CommentFile].empty()) { - OrderedComments[CommentFile][CommentOffset] = - new (Allocator) RawComment(RC); + auto &OC = OrderedComments[CommentFile]; + if (OC.empty()) { + OC[CommentOffset] = new (Allocator) RawComment(RC); return; } - const RawComment &C1 = *OrderedComments[CommentFile].rbegin()->second; + const RawComment &C1 = *OC.rbegin()->second; const RawComment &C2 = RC; // Merge comments only if there is only whitespace between them. From eaedfc0e5299d43dda28346eb2a5b068a8bee58d Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 10 Feb 2025 07:49:17 -0800 Subject: [PATCH 158/293] [Lex] Avoid repeated hash lookups (NFC) (#126462) --- clang/lib/Lex/ModuleMap.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index ccf94f6345ff2..998e2b977d109 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -2928,9 +2928,10 @@ void ModuleMapParser::parseInferredModuleDecl(bool Framework, bool Explicit) { ActiveModule->InferExplicitSubmodules = Explicit; } else { // We'll be inferring framework modules for this directory. - Map.InferredDirectories[Directory].InferModules = true; - Map.InferredDirectories[Directory].Attrs = Attrs; - Map.InferredDirectories[Directory].ModuleMapFID = ModuleMapFID; + auto &InfDir = Map.InferredDirectories[Directory]; + InfDir.InferModules = true; + InfDir.Attrs = Attrs; + InfDir.ModuleMapFID = ModuleMapFID; // FIXME: Handle the 'framework' keyword. 
} From ba9810e803744974157e85a80854e163818db608 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 10 Feb 2025 07:49:42 -0800 Subject: [PATCH 159/293] [TableGen] Avoid repeated hash lookups (NFC) (#126464) --- clang/utils/TableGen/MveEmitter.cpp | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp index 014b20667e03e..7bee2996382c1 100644 --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -1629,17 +1629,10 @@ void EmitterBase::EmitBuiltinCG(raw_ostream &OS) { for (const auto &OI : kv.second) key.push_back(OI.ParamValues[i]); - auto Found = ParamNumberMap.find(key); - if (Found != ParamNumberMap.end()) { - // Yes, an existing parameter variable can be reused for this. - ParamNumbers.push_back(Found->second); - continue; - } - - // No, we need a new parameter variable. - int ExistingIndex = ParamNumberMap.size(); - ParamNumberMap[key] = ExistingIndex; - ParamNumbers.push_back(ExistingIndex); + // Obtain a new parameter variable if we don't have one. 
+ int ParamNum = + ParamNumberMap.try_emplace(key, ParamNumberMap.size()).first->second; + ParamNumbers.push_back(ParamNum); } // Now we're ready to do the pass 2 code generation, which will emit the From de563951b7740b3f2e1b3a07362e7890e09624ec Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 10 Feb 2025 07:50:12 -0800 Subject: [PATCH 160/293] [Analysis] Avoid repeated hash lookups (NFC) (#126465) --- llvm/include/llvm/Analysis/RegionInfoImpl.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/Analysis/RegionInfoImpl.h b/llvm/include/llvm/Analysis/RegionInfoImpl.h index db3a9027549ce..eb99d8bc6fb23 100644 --- a/llvm/include/llvm/Analysis/RegionInfoImpl.h +++ b/llvm/include/llvm/Analysis/RegionInfoImpl.h @@ -720,16 +720,14 @@ void RegionInfoBase::buildRegionsTree(DomTreeNodeT *N, RegionT *region) { while (BB == region->getExit()) region = region->getParent(); - typename BBtoRegionMap::iterator it = BBtoRegion.find(BB); + auto [It, Inserted] = BBtoRegion.try_emplace(BB, region); // This basic block is a start block of a region. It is already in the // BBtoRegion relation. Only the child basic blocks have to be updated. 
- if (it != BBtoRegion.end()) { - RegionT *newRegion = it->second; + if (!Inserted) { + RegionT *newRegion = It->second; region->addSubRegion(getTopMostParent(newRegion)); region = newRegion; - } else { - BBtoRegion[BB] = region; } for (DomTreeNodeBase *C : *N) { From 2f88672414b4e9c74c47718c9979c79ba4c40e04 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 10 Feb 2025 07:50:32 -0800 Subject: [PATCH 161/293] [Coroutines] Avoid repeated hash lookups (NFC) (#126466) --- llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp index 3686c7c153999..5021425152f6c 100644 --- a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp +++ b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp @@ -70,11 +70,12 @@ struct RematGraph { std::deque> &WorkList, User *FirstUse) { RematNode *N = NUPtr.get(); - if (Remats.count(N->Node)) + auto [It, Inserted] = Remats.try_emplace(N->Node); + if (!Inserted) return; // We haven't see this node yet - add to the list - Remats[N->Node] = std::move(NUPtr); + It->second = std::move(NUPtr); for (auto &Def : N->Node->operands()) { Instruction *D = dyn_cast(Def.get()); if (!D || !MaterializableCallback(*D) || From 6228379a6c98d90d81db1a7b15f9682b7b01fb90 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 10 Feb 2025 07:50:57 -0800 Subject: [PATCH 162/293] [llvm-profgen] Avoid repeated hash lookups (NFC) (#126467) --- llvm/tools/llvm-profgen/MissingFrameInferrer.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/tools/llvm-profgen/MissingFrameInferrer.cpp b/llvm/tools/llvm-profgen/MissingFrameInferrer.cpp index ee49950f39ca4..eefe38cd3fa00 100644 --- a/llvm/tools/llvm-profgen/MissingFrameInferrer.cpp +++ b/llvm/tools/llvm-profgen/MissingFrameInferrer.cpp @@ -206,11 +206,12 @@ uint64_t 
MissingFrameInferrer::computeUniqueTailCallPath( uint64_t MissingFrameInferrer::computeUniqueTailCallPath( uint64_t From, BinaryFunction *To, SmallVectorImpl &Path) { - if (!TailCallEdgesF.count(From)) + auto It = TailCallEdgesF.find(From); + if (It == TailCallEdgesF.end()) return 0; Path.push_back(From); uint64_t NumPaths = 0; - for (auto Target : TailCallEdgesF[From]) { + for (auto Target : It->second) { NumPaths += computeUniqueTailCallPath(Target, To, Path); // Stop analyzing the remaining if we are already seeing more than one // reachable paths. From 783275eb7b3ecde63bdb6ac1316c090bfc568bdd Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 10 Feb 2025 10:57:22 -0500 Subject: [PATCH 163/293] [clang] Handle f(no-)strict-overflow, f(no-)wrapv, f(no-)wrapv-pointer like gcc (#126524) We now process all 6 options left-to-right and pick whatever is active at the end. Fixes #124868. --- clang/lib/Driver/ToolChains/CommonArgs.cpp | 48 +++++++++++++++------- clang/test/Driver/clang_wrapv_opts.c | 24 +++++------ 2 files changed, 45 insertions(+), 27 deletions(-) diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 61917db4d780d..9a4d3f55c911c 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -3096,21 +3096,39 @@ bool tools::shouldRecordCommandLine(const ToolChain &TC, void tools::renderCommonIntegerOverflowOptions(const ArgList &Args, ArgStringList &CmdArgs) { - // -fno-strict-overflow implies -fwrapv if it isn't disabled, but - // -fstrict-overflow won't turn off an explicitly enabled -fwrapv. 
- bool StrictOverflow = Args.hasFlag(options::OPT_fstrict_overflow, - options::OPT_fno_strict_overflow, true); - if (Arg *A = Args.getLastArg(options::OPT_fwrapv, options::OPT_fno_wrapv)) { - if (A->getOption().matches(options::OPT_fwrapv)) - CmdArgs.push_back("-fwrapv"); - } else if (!StrictOverflow) { - CmdArgs.push_back("-fwrapv"); + bool use_fwrapv = false; + bool use_fwrapv_pointer = false; + for (const Arg *A : Args.filtered( + options::OPT_fstrict_overflow, options::OPT_fno_strict_overflow, + options::OPT_fwrapv, options::OPT_fno_wrapv, + options::OPT_fwrapv_pointer, options::OPT_fno_wrapv_pointer)) { + A->claim(); + switch (A->getOption().getID()) { + case options::OPT_fstrict_overflow: + use_fwrapv = false; + use_fwrapv_pointer = false; + break; + case options::OPT_fno_strict_overflow: + use_fwrapv = true; + use_fwrapv_pointer = true; + break; + case options::OPT_fwrapv: + use_fwrapv = true; + break; + case options::OPT_fno_wrapv: + use_fwrapv = false; + break; + case options::OPT_fwrapv_pointer: + use_fwrapv_pointer = true; + break; + case options::OPT_fno_wrapv_pointer: + use_fwrapv_pointer = false; + break; + } } - if (Arg *A = Args.getLastArg(options::OPT_fwrapv_pointer, - options::OPT_fno_wrapv_pointer)) { - if (A->getOption().matches(options::OPT_fwrapv_pointer)) - CmdArgs.push_back("-fwrapv-pointer"); - } else if (!StrictOverflow) { + + if (use_fwrapv) + CmdArgs.push_back("-fwrapv"); + if (use_fwrapv_pointer) CmdArgs.push_back("-fwrapv-pointer"); - } } diff --git a/clang/test/Driver/clang_wrapv_opts.c b/clang/test/Driver/clang_wrapv_opts.c index 9f3a884324dcd..295d8deb0d99d 100644 --- a/clang/test/Driver/clang_wrapv_opts.c +++ b/clang/test/Driver/clang_wrapv_opts.c @@ -1,20 +1,20 @@ // RUN: %clang -### -S -fwrapv -fno-wrapv -fwrapv -Werror %s 2>&1 | FileCheck -check-prefix=CHECK1 %s // CHECK1: "-fwrapv" -// + // RUN: %clang -### -S -fwrapv-pointer -fno-wrapv-pointer -fwrapv-pointer -Werror %s 2>&1 | FileCheck -check-prefix=CHECK1-POINTER %s // 
CHECK1-POINTER: "-fwrapv-pointer" -// + // RUN: %clang -### -S -fstrict-overflow -fno-strict-overflow -Werror %s 2>&1 | FileCheck -check-prefix=CHECK2 %s // CHECK2: "-fwrapv"{{.*}}"-fwrapv-pointer" -// + // RUN: %clang -### -S -fwrapv -fstrict-overflow -Werror -Werror %s 2>&1 | FileCheck -check-prefix=CHECK3 %s --implicit-check-not="-fwrapv-pointer" -// CHECK3: "-fwrapv" -// +// CHECK3-NOT: "-fwrapv" + // RUN: %clang -### -S -fwrapv-pointer -fstrict-overflow -Werror %s 2>&1 | FileCheck -check-prefix=CHECK3-POINTER %s --implicit-check-not="-fwrapv" -// CHECK3-POINTER: "-fwrapv-pointer" -// -// RUN: %clang -### -S -fno-wrapv -fno-strict-overflow -Werror %s 2>&1 | FileCheck -check-prefix=CHECK4 %s --implicit-check-not="-fwrapv" -// CHECK4: "-fwrapv-pointer" -// -// RUN: %clang -### -S -fno-wrapv-pointer -fno-strict-overflow -Werror %s 2>&1 | FileCheck -check-prefix=CHECK4-POINTER %s --implicit-check-not="-fwrapv-pointer" -// CHECK4-POINTER: "-fwrapv" +// CHECK3-POINTER-NOT: "-fwrapv-pointer" + +// RUN: %clang -### -S -fno-wrapv -fno-strict-overflow -fno-wrapv-pointer -Werror %s 2>&1 | FileCheck -check-prefix=CHECK4 %s --implicit-check-not="-fwrapv-pointer" +// CHECK4: "-fwrapv" + +// RUN: %clang -### -S -fno-wrapv-pointer -fno-strict-overflow -fno-wrapv -Werror %s 2>&1 | FileCheck -check-prefix=CHECK4-POINTER %s --implicit-check-not="-fwrapv" +// CHECK4-POINTER: "-fwrapv-pointer" From 308d28667c14e7c14d8688cd19201308e07c8721 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 10 Feb 2025 10:58:16 -0500 Subject: [PATCH 164/293] [llvm][docs] Tweak backporting instructions a bit (#126519) * Drop ".Z" in milestone name since we've been doing X.Y releases instead of X.Y.Z releases since LLVM 18 * Add "LLVM" prefix since that's what release milestones are named * Use a numbered list to make it clearer that there are two steps needed, and add some more details to the first step --- llvm/docs/GitHub.rst | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff 
--git a/llvm/docs/GitHub.rst b/llvm/docs/GitHub.rst index 892b8abcc2d4e..b5b75db91e1c4 100644 --- a/llvm/docs/GitHub.rst +++ b/llvm/docs/GitHub.rst @@ -438,9 +438,13 @@ Releases Backporting Fixes to the Release Branches ----------------------------------------- You can use special comments on issues or pull requests to make backport -requests for the release branches. This is done by making a comment containing -the following command on any issue or pull request that has been added to one -of the "X.Y.Z Release" milestones. +requests for the release branches. To do this, after your pull request has been +merged: + +1. Edit "Milestone" at the right side of the issue or pull request + to say "LLVM X.Y Release" + +2. Add a comment to it in the following format: :: From 1c583c19bb7914a2686e245b7e1d14f82fe454eb Mon Sep 17 00:00:00 2001 From: Razvan Lupusoru Date: Mon, 10 Feb 2025 08:03:38 -0800 Subject: [PATCH 165/293] [acc][mlir] Add functionality for categorizing OpenACC variable types (#126167) OpenACC specification describes the following type categories: scalar, array, composite, and aggregate (which includes arrays, composites, and others such as Fortran pointer/allocatable). Decision for how to do implicit mapping is dependent on a variable's category. Since acc dialect's only means of distinguishing between types is through the interfaces attached, add API to be able to get the type category. In addition to defining the new API, attempt to provide a base implementation for memref which matches what OpenACC spec describes. 
--- .../mlir/Dialect/OpenACC/OpenACCOps.td | 68 +++++++++++++++++++ .../Dialect/OpenACC/OpenACCTypeInterfaces.td | 36 +++++++++- mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 33 ++++++++- 3 files changed, 135 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index df0d408d0f0e1..42da20251c190 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -65,6 +65,74 @@ def OpenACC_ReductionOperatorAttr : EnumAttr` }]; } +// OpenACC variable type categorization. This is needed because OpenACC +// dialect is used with other dialects, and each dialect defines its own +// types. Thus, in order to be able to classify types and apply right semantics, +// it is needed to ensure the types can be categorized. +def OpenACC_VariableTypeUncategorized : I32BitEnumAttrCaseNone<"uncategorized">; + +// The OpenACC spec definition of scalar type is as follows (from 3.3 spec, +// line 5454): +// Scalar datatype - an intrinsic or built-in datatype that is not an array or +// aggregate datatype. In Fortran, scalar datatypes are integer, real, double +// precision, complex, or logical. In C, scalar datatypes are char (signed or +// unsigned), int (signed or unsigned, with optional short, long or long long +// attribute), enum, float, double, long double, _Complex (with optional float +// or long attribute), or any pointer datatype. In C++, scalar datatypes are +// char (signed or unsigned), wchar_t, int (signed or unsigned, with optional +// short, long or long long attribute), enum, bool, float, double, long double, +// or any pointer datatype. Not all implementations or targets will support all +// of these datatypes. +// From an MLIR type perspective, the types that those language types map to +// will be categorized as scalar. 
+def OpenACC_VariableTypeScalar : I32BitEnumAttrCaseBit<"scalar", 0>; + +// Not in OpenACC spec glossary as its own definition but used throughout the +// spec. One definition of array that can be assumed for purposes of type +// categorization is that it is a collection of elements of same type. +def OpenACC_VariableTypeArray : I32BitEnumAttrCaseBit<"array", 1>; + +// The OpenACC spec definition of composite type is as follows (from 3.3 spec, +// line 5354): +// Composite datatype - a derived type in Fortran, or a struct or union type in +// C, or a class, struct, or union type in C++. (This is different from the use +// of the term composite data type in the C and C++ languages.) +def OpenACC_VariableTypeComposite : I32BitEnumAttrCaseBit<"composite", 2>; + +// The OpenACC spec uses the type category "aggregate" to capture both arrays +// and composite types. However, it includes types which do not fall in either +// of those categories. Thus create a case for the others. +// For example, reading the definition of "Aggregate Variables" in the 3.3 +// spec line 5346 shows this distinction: +// Aggregate variables - a variable of any non-scalar datatype, including array +// or composite variables. In Fortran, this includes any variable with +// allocatable or pointer attribute and character variables +def OpenACC_VariableTypeOtherNonScalar : I32BitEnumAttrCaseBit<"nonscalar", 3>; + +// The OpenACC spec definition of aggregate type is as follows (from 3.3 spec, +// line 5342): +// Aggregate datatype - any non-scalar datatype such as array and composite +// datatypes. In Fortran, aggregate datatypes include arrays, derived types, +// character types. In C, aggregate datatypes include arrays, targets of +// pointers, structs, and unions. In C++, aggregate datatypes include arrays, +// targets of pointers, classes, structs, and unions. 
+def OpenACC_VariableTypeAggregate : I32BitEnumAttrCaseGroup<"aggregate", + [OpenACC_VariableTypeArray, OpenACC_VariableTypeComposite, + OpenACC_VariableTypeOtherNonScalar]>; + +def OpenACC_VariableTypeCategory : I32BitEnumAttr< + "VariableTypeCategory", + "Captures different type categories described in OpenACC spec", + [ + OpenACC_VariableTypeUncategorized, OpenACC_VariableTypeScalar, + OpenACC_VariableTypeArray, OpenACC_VariableTypeComposite, + OpenACC_VariableTypeOtherNonScalar, OpenACC_VariableTypeAggregate]> { + let separator = ","; + let cppNamespace = "::mlir::acc"; + let genSpecializedAttr = 0; + let printBitEnumPrimaryGroups = 1; +} + // Type used in operation below. def IntOrIndex : AnyTypeOf<[AnyInteger, Index]>; diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td index bec46be89f058..fa66a4746f7da 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td @@ -28,6 +28,28 @@ def OpenACC_PointerLikeTypeInterface : TypeInterface<"PointerLikeType"> { /*retTy=*/"::mlir::Type", /*methodName=*/"getElementType" >, + InterfaceMethod< + /*description=*/[{ + Returns the type category of the pointee. The `var` is provided because + a dialect's type system may be incomplete. For example, consider a + dialect which computes interior pointers - so a float array element + may be represented as `ptr`. The type system says the pointee + is `f32` but this is not a scalar from the point-of-view of OpenACC. + It is an array element and thus the appropriate type category is + "array" - therefore being able to look up how a variable is computed + is important for a complete type determination. + The `varType` is provided in cases where a dialect's type system + erased the target type. 
+ }], + /*retTy=*/"::mlir::acc::VariableTypeCategory", + /*methodName=*/"getPointeeTypeCategory", + /*args=*/(ins "::mlir::TypedValue<::mlir::acc::PointerLikeType>":$varPtr, + "::mlir::Type":$varType), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return ::mlir::acc::VariableTypeCategory::uncategorized; + }] + >, ]; } @@ -106,7 +128,7 @@ def OpenACC_MappableTypeInterface : TypeInterface<"MappableType"> { return {}; }] >, - InterfaceMethod< + InterfaceMethod< /*description=*/[{ Returns explicit `acc.bounds` operations that envelop the whole data structure. These operations are inserted using the provided builder @@ -121,6 +143,18 @@ def OpenACC_MappableTypeInterface : TypeInterface<"MappableType"> { return {}; }] >, + InterfaceMethod< + /*description=*/[{ + Returns the OpenACC type category. + }], + /*retTy=*/"::mlir::acc::VariableTypeCategory", + /*methodName=*/"getTypeCategory", + /*args=*/(ins "::mlir::Value":$var), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return ::mlir::acc::VariableTypeCategory::uncategorized; + }] + >, ]; } diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 9ebce4db854dc..aaa3db22110ac 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -32,11 +32,42 @@ using namespace acc; #include "mlir/Dialect/OpenACCMPCommon/Interfaces/OpenACCMPOpsInterfaces.cpp.inc" namespace { + +static bool isScalarLikeType(Type type) { + return type.isIntOrIndexOrFloat() || isa(type); +} + struct MemRefPointerLikeModel : public PointerLikeType::ExternalModel { Type getElementType(Type pointer) const { - return llvm::cast(pointer).getElementType(); + return cast(pointer).getElementType(); + } + mlir::acc::VariableTypeCategory + getPointeeTypeCategory(Type pointer, TypedValue varPtr, + Type varType) const { + if (auto mappableTy = dyn_cast(varType)) { + return mappableTy.getTypeCategory(varPtr); + } + auto memrefTy = cast(pointer); + if 
(!memrefTy.hasRank()) { + // This memref is unranked - aka it could have any rank, including a + // rank of 0 which could mean scalar. For now, return uncategorized. + return mlir::acc::VariableTypeCategory::uncategorized; + } + + if (memrefTy.getRank() == 0) { + if (isScalarLikeType(memrefTy.getElementType())) { + return mlir::acc::VariableTypeCategory::scalar; + } + // Zero-rank non-scalar - need further analysis to determine the type + // category. For now, return uncategorized. + return mlir::acc::VariableTypeCategory::uncategorized; + } + + // It has a rank - must be an array. + assert(memrefTy.getRank() > 0 && "rank expected to be positive"); + return mlir::acc::VariableTypeCategory::array; } }; From 0010a3c97ef4df11aa50b381ea801c9ba8dd516f Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Mon, 10 Feb 2025 16:04:43 +0000 Subject: [PATCH 166/293] [NFC][LoopVectorize] Add more partial reduction tests (#126525) * Adds variants of dotp (dotp_i8_to_i64_has_neon_dotprod, dotp_i16_to_i64_has_neon_dotprod) that show how the loop vectoriser has generated fixed-width partial reductions without any matching NEON udot instruction. * Adds loops that could also benefit from partial reductions once the work is done to recognise patterns such as %zext = zext i8 %load to i32 %acc.next = add i32 %acc, %zext See zext_add_reduc_i8_i32, etc. I intend to follow up with a patch to add support for vectorising such patterns. 
--- .../AArch64/partial-reduce-dot-product.ll | 1461 +++++++++++++++-- 1 file changed, 1364 insertions(+), 97 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 2aaea965cb645..26b5f07fbaed7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 4 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 ; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW @@ -12,7 +12,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -20,10 +20,10 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 
4 -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY1:%.*]] +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP14:%.*]], [[VECTOR_BODY1]] ] +; CHECK-INTERLEAVE1-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX1]], 0 ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP12]] ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP16]], i32 0 @@ -37,18 +37,18 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add [[TMP13]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP5]] ; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: -; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ] -; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 
[[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] -; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP27]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-INTERLEAVE1: for.body: -; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 @@ -59,9 +59,9 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-INTERLEAVE1: for.exit: -; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP27]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; 
CHECK-INTERLEAVED-LABEL: define i32 @dotp( @@ -69,7 +69,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-INTERLEAVED-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 @@ -77,11 +77,11 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY1:%.*]] +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY1]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP23:%.*]], [[VECTOR_BODY1]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP24:%.*]], [[VECTOR_BODY1]] ] +; CHECK-INTERLEAVED-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX1]], 0 ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP17]] ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0 @@ -107,19 +107,19 @@ define i32 
@dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP5]] ; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP24]], [[TMP23]] ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: -; CHECK-INTERLEAVED-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH:%.*]] ] -; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] -; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP16]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-INTERLEAVED: for.body: -; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[ADD:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] 
+; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 ; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 @@ -130,9 +130,9 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-INTERLEAVED: for.exit: -; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[VECTOR_BODY]] ], [ [[TMP16]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-MAXBW-LABEL: define i32 @dotp( @@ -214,6 +214,422 @@ for.exit: ; preds = %for.body ret i32 %add } +define i64 @dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b) #1 { +; CHECK-INTERLEAVE1-LABEL: define i64 @dotp_i8_to_i64_has_neon_dotprod( +; CHECK-INTERLEAVE1-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 1024 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 1024 +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, 
[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = mul nuw nsw <16 x i64> [[TMP7]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP8]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, 
[[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL3:%.*]] = phi ptr [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = phi ptr [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[GEP_A_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = phi ptr [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ], [ [[GEP_B_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A_NEXT]] = getelementptr inbounds nuw i8, ptr [[GEP_A]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B_NEXT]] = getelementptr inbounds nuw i8, ptr [[GEP_B]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add nsw i64 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i64 [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define i64 @dotp_i8_to_i64_has_neon_dotprod( +; CHECK-INTERLEAVED-SAME: 
ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 1024 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 1024 +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul nuw nsw <16 x i64> [[TMP7]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP8]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL3:%.*]] = phi ptr [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = phi ptr [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[GEP_A_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = phi ptr [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ], [ [[GEP_B_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A_NEXT]] = getelementptr inbounds nuw i8, ptr [[GEP_A]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i64 +; CHECK-INTERLEAVED-NEXT: [[GEP_B_NEXT]] = getelementptr inbounds nuw i8, ptr [[GEP_B]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i64 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add nsw i64 [[MUL]], [[ACCUM]] +; 
CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i64 [[ADD_LCSSA]] +; +; CHECK-MAXBW-LABEL: define i64 @dotp_i8_to_i64_has_neon_dotprod( +; CHECK-MAXBW-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]] +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]] +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP8]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]] +; CHECK-MAXBW-NEXT: 
[[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD2]] to +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw [[TMP13]], [[TMP11]] +; CHECK-MAXBW-NEXT: [[TMP15]] = add [[TMP14]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64( [[TMP15]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL3:%.*]] = phi ptr [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = phi ptr [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[GEP_A_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = phi ptr [ [[BC_RESUME_VAL4]], 
[[SCALAR_PH]] ], [ [[GEP_B_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A_NEXT]] = getelementptr inbounds nuw i8, ptr [[GEP_A]], i64 1 +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i64 +; CHECK-MAXBW-NEXT: [[GEP_B_NEXT]] = getelementptr inbounds nuw i8, ptr [[GEP_B]], i64 1 +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i64 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add nsw i64 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i64 [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i64 [ 0, %entry ], [ %add, %for.body ] + %gep.a = phi ptr [ %a, %entry ], [ %gep.a.next, %for.body ] + %gep.b = phi ptr [ %b, %entry ], [ %gep.b.next, %for.body ] + %gep.a.next = getelementptr inbounds nuw i8, ptr %gep.a, i64 1 + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i64 + %gep.b.next = getelementptr inbounds nuw i8, ptr %gep.b, i64 1 + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i64 + %mul = mul nuw nsw i64 %ext.b, %ext.a + %add = add nsw i64 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i64 %add +} + +define i64 @dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b) #1 { +; 
CHECK-INTERLEAVE1-LABEL: define i64 @dotp_i16_to_i64_has_neon_dotprod( +; CHECK-INTERLEAVE1-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 2048 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 2048 +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX1]], 0 +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP4]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[NEXT_GEP2]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i16>, ptr [[TMP6]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = zext <8 x i16> [[WIDE_LOAD3]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = mul nuw nsw <8 x i64> [[TMP7]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> 
[[VEC_PHI]], <8 x i64> [[TMP8]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL5:%.*]] = phi ptr [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = phi ptr [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ], [ [[GEP_A_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = phi ptr [ [[BC_RESUME_VAL5]], [[SCALAR_PH]] ], [ [[GEP_B_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A_NEXT]] = getelementptr inbounds nuw i16, ptr [[GEP_A]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B_NEXT]] = getelementptr inbounds nuw i16, ptr [[GEP_B]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 
2 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add nsw i64 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i64 [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define i64 @dotp_i16_to_i64_has_neon_dotprod( +; CHECK-INTERLEAVED-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 2048 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 2048 +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX2]], 
0 +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP4]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP5]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <8 x i16> [[WIDE_LOAD4]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[TMP8]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP9]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <8 x i16> [[WIDE_LOAD5]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = zext <8 x i16> [[WIDE_LOAD6]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nuw nsw <8 x i64> [[TMP10]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul nuw nsw <8 x i64> [[TMP11]], [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP12]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI1]], <8 x i64> [[TMP13]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> 
[[PARTIAL_REDUCE7]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL8:%.*]] = phi ptr [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL9:%.*]] = phi ptr [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = phi ptr [ [[BC_RESUME_VAL8]], [[SCALAR_PH]] ], [ [[GEP_A_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = phi ptr [ [[BC_RESUME_VAL9]], [[SCALAR_PH]] ], [ [[GEP_B_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A_NEXT]] = getelementptr inbounds nuw i16, ptr [[GEP_A]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64 +; CHECK-INTERLEAVED-NEXT: [[GEP_B_NEXT]] = getelementptr inbounds nuw i16, ptr [[GEP_B]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i64 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add nsw i64 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: 
[[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i64 [[ADD_LCSSA]] +; +; CHECK-MAXBW-LABEL: define i64 @dotp_i16_to_i64_has_neon_dotprod( +; CHECK-MAXBW-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 2 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = mul i64 [[N_VEC]], 2 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]] +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP10]] +; CHECK-MAXBW-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 2 +; 
CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX1]], 0 +; CHECK-MAXBW-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]] +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 2 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[NEXT_GEP2]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP14]], align 2 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = zext [[WIDE_LOAD3]] to +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = mul nuw nsw [[TMP15]], [[TMP13]] +; CHECK-MAXBW-NEXT: [[TMP17]] = add [[TMP16]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP17]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[A]], [[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL5:%.*]] = phi ptr [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: 
[[GEP_A:%.*]] = phi ptr [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ], [ [[GEP_A_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = phi ptr [ [[BC_RESUME_VAL5]], [[SCALAR_PH]] ], [ [[GEP_B_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A_NEXT]] = getelementptr inbounds nuw i16, ptr [[GEP_A]], i64 1 +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64 +; CHECK-MAXBW-NEXT: [[GEP_B_NEXT]] = getelementptr inbounds nuw i16, ptr [[GEP_B]], i64 1 +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load i16, ptr [[GEP_B]], align 2 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[LOAD_B]] to i64 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = add nsw i64 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i64 [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i64 [ 0, %entry ], [ %add, %for.body ] + %gep.a = phi ptr [ %a, %entry ], [ %gep.a.next, %for.body ] + %gep.b = phi ptr [ %b, %entry ], [ %gep.b.next, %for.body ] + %gep.a.next = getelementptr inbounds nuw i16, ptr %gep.a, i64 1 + %load.a = load i16, ptr %gep.a, align 2 + %ext.a = zext i16 %load.a to i64 + %gep.b.next = getelementptr inbounds nuw i16, ptr %gep.b, i64 1 + %load.b = load i16, ptr %gep.b, align 2 + %ext.b = zext i16 %load.b to i64 + %mul = mul nuw nsw i64 %ext.b, %ext.a + %add = add nsw i64 %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label 
%for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i64 %add +} + define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_different_types( ; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -297,16 +713,16 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[VEC_EPILOG_ITER_CHECK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) ; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: -; CHECK-INTERLEAVE1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ] -; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP71]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-INTERLEAVE1: for.body: -; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] 
], [ [[ADD:%.*]], [[FOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] ; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 @@ -318,9 +734,9 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-INTERLEAVE1: for.exit: -; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types( @@ -476,7 +892,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP141]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]] ; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) @@ -498,7 +914,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], 
[[ACCUM]] ; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-INTERLEAVED: for.exit: ; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP142]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] @@ -585,7 +1001,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP138]]) ; CHECK-MAXBW-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] @@ -606,7 +1022,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-MAXBW: for.exit: ; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP71]], [[MIDDLE_BLOCK]] ] ; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] @@ -669,7 +1085,7 @@ define i32 
@not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add [[TMP16]], [[TMP17]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], 8 @@ -698,7 +1114,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-INTERLEAVE1: for.exit: ; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] @@ -749,7 +1165,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add [[TMP25]], [[TMP26]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; 
CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 8 @@ -778,7 +1194,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-INTERLEAVED: for.exit: ; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP32]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] @@ -818,7 +1234,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = add [[TMP25]], [[TMP26]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], 8 @@ -847,7 +1263,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop 
[[LOOP11:![0-9]+]] ; CHECK-MAXBW: for.exit: ; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] ; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] @@ -909,7 +1325,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP17]] = add [[TMP16]], [[TMP15]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], 8 @@ -938,7 +1354,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[EXT_B]] ; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-INTERLEAVE1: for.exit: ; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] @@ -981,7 +1397,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add [[TMP30]], [[TMP22]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP8:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul i32 [[TMP23]], 8 @@ -1010,7 +1426,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[EXT_B]] ; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-INTERLEAVED: for.exit: ; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] @@ -1049,7 +1465,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP21]] = add [[TMP20]], [[TMP19]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 8 @@ -1078,7 +1494,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[EXT_B]] ; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label 
[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-MAXBW: for.exit: ; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] ; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] @@ -1173,7 +1589,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP41]] = add [[TMP40]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]] ; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP41]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP35]]) @@ -1231,7 +1647,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]] ; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-INTERLEAVE1: exit: ; CHECK-INTERLEAVE1-NEXT: [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP46]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVE1-NEXT: [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP45]], [[MIDDLE_BLOCK]] ] @@ -1361,7 +1777,7 @@ define i32 @dotp_unrolled(i32 
%num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP81]] = add [[TMP79]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP40]] ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP81]], [[TMP80]] ; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) @@ -1423,7 +1839,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]] ; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]] -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-INTERLEAVED: exit: ; CHECK-INTERLEAVED-NEXT: [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVED-NEXT: [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] @@ -1501,7 +1917,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI4]], [[TMP73]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP10:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE16]]) ; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE17]]) @@ -1559,7 +1975,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[ADD_A3]] = add nsw i32 [[MUL_A3]], [[ACCUM3]] ; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[NUM_IN]] -; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-MAXBW: exit: ; CHECK-MAXBW-NEXT: [[ADD_A0_LCSSA:%.*]] = phi i32 [ [[ADD_A0]], [[FOR_BODY]] ], [ [[TMP42]], [[MIDDLE_BLOCK]] ] ; CHECK-MAXBW-NEXT: [[ADD_A1_LCSSA:%.*]] = phi i32 [ [[ADD_A1]], [[FOR_BODY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] @@ -1666,7 +2082,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) ; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -1687,7 +2103,7 @@ 
define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVE1: exit: ; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] @@ -1732,7 +2148,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i32 0 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) ; CHECK-INTERLEAVED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -1753,7 +2169,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label 
[[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVED: exit: ; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] @@ -1798,7 +2214,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = extractelement [[TMP19]], i32 0 -; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -1819,7 +2235,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[ACCUM]] ; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-MAXBW: exit: ; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] ; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] @@ -1877,7 +2293,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add [[TMP13]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; 
CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() @@ -1903,7 +2319,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVE1: for.exit: ; CHECK-INTERLEAVE1-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] @@ -1953,7 +2369,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP24]], [[TMP23]] ; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) @@ -1980,7 +2396,7 @@ 
define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVED: for.exit: ; CHECK-INTERLEAVED-NEXT: [[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ] @@ -2017,7 +2433,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP24]]) ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() @@ -2043,7 +2459,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] ; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-MAXBW: for.exit: ; CHECK-MAXBW-NEXT: 
[[EXT_B_LCSSA:%.*]] = phi i32 [ [[EXT_B]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] ; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] @@ -2106,7 +2522,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add [[VEC_PHI]], [[TMP14]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP15]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]] @@ -2128,7 +2544,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[CONV3]], [[CONV]] ; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i64 [[SUM]], [[MUL]] ; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_IV_NEXT]], 16 -; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVE1: exit: ; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVE1-NEXT: ret i64 [[ADD_LCSSA]] @@ -2178,7 +2594,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add [[VEC_PHI1]], [[TMP23]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 
[[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP25]], [[TMP24]] ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[BIN_RDX]]) @@ -2201,7 +2617,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[CONV3]], [[CONV]] ; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i64 [[SUM]], [[MUL]] ; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_IV_NEXT]], 16 -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVED: exit: ; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVED-NEXT: ret i64 [[ADD_LCSSA]] @@ -2238,7 +2654,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP19]] = add [[VEC_PHI]], [[TMP14]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64( [[TMP19]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, [[N_VEC]] @@ -2260,7 +2676,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[CONV3]], [[CONV]] ; CHECK-MAXBW-NEXT: 
[[ADD]] = add i64 [[SUM]], [[MUL]] ; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_IV_NEXT]], 16 -; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-MAXBW: exit: ; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] ; CHECK-MAXBW-NEXT: ret i64 [[ADD_LCSSA]] @@ -2377,7 +2793,7 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add i32 [[TMP21]], [[TMP15]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP23]], [[TMP22]] ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2404,7 +2820,7 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[PTR]], i64 16 ; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-INTERLEAVED: for.exit: ; CHECK-INTERLEAVED-NEXT: [[ADD_1_LCSSA:%.*]] = phi i32 [ [[ADD_1]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVED-NEXT: [[ADD_FLOAT:%.*]] = sitofp i32 [[ADD_1_LCSSA]] to 
float @@ -2521,7 +2937,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2540,7 +2956,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-INTERLEAVE1-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-INTERLEAVE1: exit.loopexit: ; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVE1-NEXT: br label [[EXIT]] @@ -2581,7 +2997,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], 
label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) @@ -2601,7 +3017,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-INTERLEAVED-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK-INTERLEAVED: exit.loopexit: ; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVED-NEXT: br label [[EXIT]] @@ -2642,7 +3058,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: [[TMP11]] = add [[TMP10]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP11]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2661,7 +3077,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]] ; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-MAXBW-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-MAXBW-NEXT: br 
i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-MAXBW: exit.loopexit: ; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] ; CHECK-MAXBW-NEXT: br label [[EXIT]] @@ -2722,7 +3138,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2741,7 +3157,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-INTERLEAVE1-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK-INTERLEAVE1: exit.loopexit: ; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVE1-NEXT: br label [[EXIT]] @@ -2782,7 +3198,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x 
i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) @@ -2802,7 +3218,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]] ; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-INTERLEAVED-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK-INTERLEAVED: exit.loopexit: ; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVED-NEXT: br label [[EXIT]] @@ -2843,7 +3259,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: [[TMP11]] = add [[TMP10]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 
@llvm.vector.reduce.add.nxv4i64( [[TMP11]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2862,7 +3278,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: [[ADD]] = add i64 [[MUL]], [[ACCUM]] ; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-MAXBW-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-MAXBW-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_1]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK-MAXBW: exit.loopexit: ; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] ; CHECK-MAXBW-NEXT: br label [[EXIT]] @@ -2936,11 +3352,40 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP18]] = add [[TMP17]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP18]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[COST]], [[FOR_BODY_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL3:%.*]] = phi ptr [ 
[[TMP8]], [[MIDDLE_BLOCK]] ], [ [[A]], [[FOR_BODY_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ [[B]], [[FOR_BODY_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[A_PTR:%.*]] = phi ptr [ [[A_GEP:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[B_PTR:%.*]] = phi ptr [ [[B_GEP:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[A_LOAD:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[A_EXT:%.*]] = zext i8 [[A_LOAD]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[B_LOAD:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[B_EXT:%.*]] = zext i8 [[B_LOAD]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[B_EXT]], [[A_EXT]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add nsw i64 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[A_GEP]] = getelementptr inbounds nuw i8, ptr [[A_PTR]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[B_GEP]] = getelementptr inbounds nuw i8, ptr [[B_PTR]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[CMP_2:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_2]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-INTERLEAVE1: exit.loopexit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[EXIT]] +; CHECK-INTERLEAVE1: exit: +; CHECK-INTERLEAVE1-NEXT: [[COST_RESULT:%.*]] = phi i64 [ [[COST]], [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ] +; 
CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = trunc i64 [[COST_RESULT]] to i32 +; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]] ; ; CHECK-INTERLEAVED-LABEL: define dso_local i32 @not_dotp_vscale1( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]], i64 [[COST:%.*]]) #[[ATTR0]] { @@ -2995,18 +3440,47 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP28]] = add [[TMP26]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP28]], [[TMP27]] ; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; -; CHECK-MAXBW-LABEL: define dso_local i32 @not_dotp_vscale1( -; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]], i64 [[COST:%.*]]) #[[ATTR0]] { -; CHECK-MAXBW-NEXT: entry: -; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 -; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP30]], [[MIDDLE_BLOCK]] ], [ [[COST]], [[FOR_BODY_PREHEADER]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[A]], [[FOR_BODY_PREHEADER]] ] +; 
CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL7:%.*]] = phi ptr [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ [[B]], [[FOR_BODY_PREHEADER]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[A_PTR:%.*]] = phi ptr [ [[A_GEP:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL6]], [[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[B_PTR:%.*]] = phi ptr [ [[B_GEP:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL7]], [[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[A_LOAD:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-INTERLEAVED-NEXT: [[A_EXT:%.*]] = zext i8 [[A_LOAD]] to i64 +; CHECK-INTERLEAVED-NEXT: [[B_LOAD:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-INTERLEAVED-NEXT: [[B_EXT:%.*]] = zext i8 [[B_LOAD]] to i64 +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[B_EXT]], [[A_EXT]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add nsw i64 [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[A_GEP]] = getelementptr inbounds nuw i8, ptr [[A_PTR]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[B_GEP]] = getelementptr inbounds nuw i8, ptr [[B_PTR]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[CMP_2:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_2]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK-INTERLEAVED: exit.loopexit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: br label [[EXIT]] +; CHECK-INTERLEAVED: exit: +; CHECK-INTERLEAVED-NEXT: [[COST_RESULT:%.*]] = phi i64 [ [[COST]], [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ] +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = trunc i64 [[COST_RESULT]] to i32 +; 
CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]] +; +; CHECK-MAXBW-LABEL: define dso_local i32 @not_dotp_vscale1( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]], i64 [[COST:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]] ; CHECK-MAXBW: for.body.preheader: ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 ; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() @@ -3042,11 +3516,40 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-MAXBW-NEXT: [[TMP20]] = add [[TMP17]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64( [[TMP20]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[COST]], [[FOR_BODY_PREHEADER]] ] +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL3:%.*]] = phi ptr [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[A]], [[FOR_BODY_PREHEADER]] ] +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ [[B]], [[FOR_BODY_PREHEADER]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] 
], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-MAXBW-NEXT: [[A_PTR:%.*]] = phi ptr [ [[A_GEP:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; CHECK-MAXBW-NEXT: [[B_PTR:%.*]] = phi ptr [ [[B_GEP:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ] +; CHECK-MAXBW-NEXT: [[A_LOAD:%.*]] = load i8, ptr [[A_PTR]], align 1 +; CHECK-MAXBW-NEXT: [[A_EXT:%.*]] = zext i8 [[A_LOAD]] to i64 +; CHECK-MAXBW-NEXT: [[B_LOAD:%.*]] = load i8, ptr [[B_PTR]], align 1 +; CHECK-MAXBW-NEXT: [[B_EXT:%.*]] = zext i8 [[B_LOAD]] to i64 +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[B_EXT]], [[A_EXT]] +; CHECK-MAXBW-NEXT: [[ADD]] = add nsw i64 [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[A_GEP]] = getelementptr inbounds nuw i8, ptr [[A_PTR]], i64 1 +; CHECK-MAXBW-NEXT: [[B_GEP]] = getelementptr inbounds nuw i8, ptr [[B_PTR]], i64 1 +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[CMP_2:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_2]], label [[EXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-MAXBW: exit.loopexit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: br label [[EXIT]] +; CHECK-MAXBW: exit: +; CHECK-MAXBW-NEXT: [[COST_RESULT:%.*]] = phi i64 [ [[COST]], [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[EXIT_LOOPEXIT]] ] +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = trunc i64 [[COST_RESULT]] to i32 +; CHECK-MAXBW-NEXT: ret i32 [[RESULT]] ; entry: %cmp = icmp sgt i32 %n, 0 @@ -3075,8 +3578,772 @@ exit: ; preds = %for.cond.cleanup.loopexit, %ent ret i32 %result } + +; == Partial reductions with add of an extend + +define i32 @zext_add_reduc_i8_i32(ptr %a) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @zext_add_reduc_i8_i32( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: 
entry: +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <16 x i32> [[TMP3]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]]) +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] 
+; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[EXT_A]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @zext_add_reduc_i8_i32( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: 
[[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i32> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i32> [[TMP5]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[EXT_A]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; 
CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] +; +; CHECK-MAXBW-LABEL: define i32 @zext_add_reduc_i8_i32( +; CHECK-MAXBW-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP12]] = add [[TMP9]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( 
[[TMP12]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[EXT_A]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %add = add i32 %ext.a, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1025 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + + +define i64 @zext_add_reduc_i8_i64(ptr %a) #0 { +; CHECK-INTERLEAVE1-LABEL: define i64 
@zext_add_reduc_i8_i64( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <16 x i64> [[TMP3]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP4]]) +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; 
CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i64 [[EXT_A]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i64 [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define i64 @zext_add_reduc_i8_i64( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i64> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i64> [[TMP5]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i64> [[TMP7]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i64 +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i64 [[EXT_A]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 +; 
CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i64 [[ADD_LCSSA]] +; +; CHECK-MAXBW-LABEL: define i64 @zext_add_reduc_i8_i64( +; CHECK-MAXBW-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP12]] = add [[TMP9]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP31:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64( [[TMP12]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i64 +; CHECK-MAXBW-NEXT: [[ADD]] = add i64 [[EXT_A]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i64 [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i64 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i64 + %add = add i64 %ext.a, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1025 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = 
%for.body + ret i64 %add +} + + +define i64 @zext_add_reduc_i16_i64(ptr %a) #0 { +; CHECK-INTERLEAVE1-LABEL: define i64 @zext_add_reduc_i16_i64( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <8 x i64> [[TMP3]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP4]]) +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; 
CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i16, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i64 [[EXT_A]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i64 [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define i64 @zext_add_reduc_i16_i64( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = 
getelementptr i16, ptr [[TMP1]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <8 x i64> [[TMP5]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i16, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64 +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i64 [[EXT_A]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: 
[[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i64 [[ADD_LCSSA]] +; +; CHECK-MAXBW-LABEL: define i64 @zext_add_reduc_i16_i64( +; CHECK-MAXBW-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 2 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP12]] = add [[TMP9]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 
[[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP12]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i16, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i16, ptr [[GEP_A]], align 2 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i16 [[LOAD_A]] to i64 +; CHECK-MAXBW-NEXT: [[ADD]] = add i64 [[EXT_A]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i64 [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i64 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i16, ptr %a, i64 %iv + %load.a = load i16, ptr %gep.a, align 2 + %ext.a = zext i16 %load.a to i64 + %add = add i64 %ext.a, %accum + %iv.next = add i64 %iv, 1 + 
%exitcond.not = icmp eq i64 %iv.next, 1025 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i64 %add +} + + +define i64 @zext_add_reduc_i8_i64_has_neon_dotprod(ptr %a) #1 { +; CHECK-INTERLEAVE1-LABEL: define i64 @zext_add_reduc_i8_i64_has_neon_dotprod( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <16 x i64> [[TMP3]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP4]]) +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = 
phi i64 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i64 [[EXT_A]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i64 [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define i64 @zext_add_reduc_i8_i64_has_neon_dotprod( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = 
getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i64> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i64> [[TMP5]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i64> [[TMP7]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 
+; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i64 +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i64 [[EXT_A]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i64 [[ADD_LCSSA]] +; +; CHECK-MAXBW-LABEL: define i64 @zext_add_reduc_i8_i64_has_neon_dotprod( +; CHECK-MAXBW-SAME: ptr [[A:%.*]]) #[[ATTR1]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: 
[[TMP10]] = add [[TMP9]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64( [[TMP10]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i64 +; CHECK-MAXBW-NEXT: [[ADD]] = add i64 [[EXT_A]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i64 [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i64 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr 
i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i64 + %add = add i64 %ext.a, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1025 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i64 %add +} + + +define i32 @sext_add_reduc_i8_i32(ptr %a) #0 { +; CHECK-INTERLEAVE1-LABEL: define i32 @sext_add_reduc_i8_i32( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <16 x i32> [[TMP3]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]]) +; CHECK-INTERLEAVE1-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVE1: for.body: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = add i32 [[EXT_A]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @sext_add_reduc_i8_i32( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] 
] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i32> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i32> [[TMP5]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br i1 false, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-INTERLEAVED: for.body: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = 
getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-INTERLEAVED-NEXT: [[ADD]] = add i32 [[EXT_A]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i32 [[ADD_LCSSA]] +; +; CHECK-MAXBW-LABEL: define i32 @sext_add_reduc_i8_i32( +; CHECK-MAXBW-SAME: ptr [[A:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , 
ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP12]] = add [[TMP9]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP12]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-MAXBW: for.body: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = sext i8 [[LOAD_A]] to i32 +; CHECK-MAXBW-NEXT: [[ADD]] = add i32 [[EXT_A]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ 
%iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = sext i8 %load.a to i32 + %add = add i32 %ext.a, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1025 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + + !7 = distinct !{!7, !8, !9, !10} !8 = !{!"llvm.loop.mustprogress"} !9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} !10 = !{!"llvm.loop.vectorize.enable", i1 true} attributes #0 = { vscale_range(1,16) "target-features"="+sve" } +attributes #1 = { vscale_range(1,16) "target-features"="+neon,+dotprod,+sve" "cpu"="neoverse-v2" } From 83af335ea47b50037beb46e5d6fb04be89f3b207 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 10 Feb 2025 16:07:57 +0000 Subject: [PATCH 167/293] [llvm][lit] Update regexes in Xunit test (#126527) I got a report that downstream this test failed and the cause was that it took longer than the 1 second we expected to run one of the test cases. This test doesn't need to be that specific, so I am updating all the time regexes to be the same one that allows 0-9 any number of digits, requires a decimal point, then 0-9 any number of digits for the final part. 
--- llvm/utils/lit/tests/xunit-output.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/utils/lit/tests/xunit-output.py b/llvm/utils/lit/tests/xunit-output.py index 392cded4653fe..c6cf3dfc24c81 100644 --- a/llvm/utils/lit/tests/xunit-output.py +++ b/llvm/utils/lit/tests/xunit-output.py @@ -8,19 +8,19 @@ # RUN: FileCheck < %t.xunit.xml %s # CHECK: -# CHECK-NEXT: -# CHECK-NEXT: -# CHECK-NEXT: +# CHECK-NEXT: +# CHECK-NEXT: +# CHECK-NEXT: # CHECK-NEXT: ]]]]> &"]]> # CHECK-NEXT: -# CHECK-NEXT: +# CHECK-NEXT: # CHECK-NEXT: # CHECK-NEXT: -# CHECK-NEXT: +# CHECK-NEXT: # CHECK-NEXT: # CHECK-NEXT: -# CHECK-NEXT: -# CHECK-NEXT: +# CHECK-NEXT: +# CHECK-NEXT: # CHECK-NEXT: # CHECK-NEXT: # CHECK-NEXT: From c69be3fe4bec916c111eec4eec1def04b16fba8d Mon Sep 17 00:00:00 2001 From: Amit Kumar Pandey <137622562+ampandey-1995@users.noreply.github.com> Date: Mon, 10 Feb 2025 21:41:49 +0530 Subject: [PATCH 168/293] [Driver][ROCm][OpenMP] Fix default ockl linking for OpenMP. (#126186) ASan gpu runtime (asanrtl.bc) linking is dependent on 'ockl.bc'. Link 'ockl.bc' only when ASan is enabled for openmp amdgpu offloading application. --- clang/lib/Driver/ToolChains/AMDGPU.cpp | 15 +++-- clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp | 2 +- .../Driver/amdgpu-openmp-sanitize-options.c | 55 +++++++++---------- clang/test/Driver/hip-sanitize-options.hip | 2 +- 4 files changed, 37 insertions(+), 37 deletions(-) diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index e66e5a32e58ac..202198e96c012 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -1014,7 +1014,12 @@ RocmInstallationDetector::getCommonBitcodeLibs( bool isOpenMP = false) const { llvm::SmallVector BCLibs; - auto GPUSanEnabled = [GPUSan]() { return std::get(GPUSan); }; + // GPU Sanitizer currently only supports ASan and is enabled through host + // ASan. 
+ auto GPUSanEnabled = [GPUSan]() { + return std::get(GPUSan) && + std::get(GPUSan).needsAsanRt(); + }; auto AddBCLib = [&](ToolChain::BitCodeLibraryInfo BCLib, bool Internalize = true) { BCLib.ShouldInternalize = Internalize; @@ -1022,9 +1027,7 @@ RocmInstallationDetector::getCommonBitcodeLibs( }; auto AddSanBCLibs = [&]() { if (GPUSanEnabled()) { - auto SanArgs = std::get(GPUSan); - if (SanArgs.needsAsanRt()) - AddBCLib(getAsanRTLPath(), false); + AddBCLib(getAsanRTLPath(), false); } }; @@ -1066,7 +1069,7 @@ ROCMToolChain::getCommonDeviceLibNames(const llvm::opt::ArgList &DriverArgs, // them all? std::tuple GPUSan( DriverArgs.hasFlag(options::OPT_fgpu_sanitize, - options::OPT_fno_gpu_sanitize, false), + options::OPT_fno_gpu_sanitize, true), getSanitizerArgs(DriverArgs)); bool DAZ = DriverArgs.hasFlag(options::OPT_fgpu_flush_denormals_to_zero, options::OPT_fno_gpu_flush_denormals_to_zero, @@ -1099,7 +1102,7 @@ bool AMDGPUToolChain::shouldSkipSanitizeOption( return false; if (!DriverArgs.hasFlag(options::OPT_fgpu_sanitize, - options::OPT_fno_gpu_sanitize, false)) + options::OPT_fno_gpu_sanitize, true)) return true; auto &Diags = TC.getDriver().getDiags(); diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp index 00bf9c7338edd..aba79f5fa6fa7 100644 --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -68,7 +68,7 @@ llvm::opt::DerivedArgList *AMDGPUOpenMPToolChain::TranslateArgs( Action::OffloadKind DeviceOffloadKind) const { DerivedArgList *DAL = HostTC.TranslateArgs(Args, BoundArch, DeviceOffloadKind); - if (!DAL) + if (!DAL || Args.hasArg(options::OPT_fsanitize_EQ)) DAL = new DerivedArgList(Args.getBaseArgs()); const OptTable &Opts = getDriver().getOpts(); diff --git a/clang/test/Driver/amdgpu-openmp-sanitize-options.c b/clang/test/Driver/amdgpu-openmp-sanitize-options.c index c28a758bfc0c5..3fb8641527666 100644 --- 
a/clang/test/Driver/amdgpu-openmp-sanitize-options.c +++ b/clang/test/Driver/amdgpu-openmp-sanitize-options.c @@ -1,5 +1,3 @@ -// REQUIRES: x86-registered-target, amdgpu-registered-target - // Fail on invalid ROCm Path. // RUN: not %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address -fgpu-sanitize -nogpuinc --rocm-path=%S/Inputs/rocm-invalid %s 2>&1 \ // RUN: | FileCheck --check-prefix=FAIL %s @@ -13,38 +11,40 @@ // RUN: | FileCheck --check-prefix=NOTSUPPORTED %s // GPU ASan Enabled Test Cases -// ASan enabled for amdgpu-arch [gfx908] -// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908 -fsanitize=address -fgpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=NOXNACK,GPUSAN %s - -// GPU ASan enabled for amdgpu-arch [gfx908:xnack-] -// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack- -fsanitize=address -fgpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=XNACKNEG,GPUSAN %s // GPU ASan enabled for amdgpu-arch [gfx908:xnack+] // RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address -fgpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=GPUSAN %s +// RUN: | FileCheck -check-prefixes=HOSTSAN,GPUSAN,SAN %s + +// GPU ASan enabled through '-fsanitize=address' flag without '-fgpu-sanitize' for amdgpu-arch [gfx908:xnack+] +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=HOSTSAN,GPUSAN,SAN %s // ASan enabled for multiple amdgpu-arch [gfx908:xnack+,gfx900:xnack+] // RUN: %clang -no-canonical-prefixes -### 
--target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ --offload-arch=gfx900:xnack+ -fsanitize=address -fgpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=GPUSAN %s +// RUN: | FileCheck -check-prefixes=HOSTSAN,GPUSAN,SAN %s // GPU ASan Disabled Test Cases -// ASan disabled for amdgpu-arch [gfx908] -// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908 -fsanitize=address -fno-gpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=NOGPUSAN %s + +// GPU ASan disabled through '-fsanitize=address' without '-fgpu-sanitize' flag for amdgpu-arch [gfx908] +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908 -fsanitize=address --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=NOXNACK,HOSTSAN,NOGPUSAN,SAN %s + +// GPU ASan disabled for amdgpu-arch [gfx908] +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908 -fsanitize=address -fgpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=NOXNACK,HOSTSAN,NOGPUSAN,SAN %s // GPU ASan disabled for amdgpu-arch [gfx908:xnack-] -// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack- -fsanitize=address -fno-gpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=NOGPUSAN %s +// RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack- -fsanitize=address -fgpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=XNACKNEG,HOSTSAN,NOGPUSAN,SAN %s -// GPU ASan disabled for amdgpu-arch [gfx908:xnack+] +// GPU ASan disabled using '-fno-gpu-sanitize' for amdgpu-arch [gfx908:xnack+] // RUN: %clang -no-canonical-prefixes -### 
--target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ -fsanitize=address -fno-gpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=NOGPUSAN %s +// RUN: | FileCheck -check-prefixes=HOSTSAN,NOGPUSAN,SAN %s -// ASan disabled for amdgpu-arch [gfx908:xnack+,gfx900:xnack+] +// GPU ASan disabled for multiple amdgpu-arch [gfx908:xnack+,gfx900:xnack+] // RUN: %clang -no-canonical-prefixes -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp --offload-arch=gfx908:xnack+ --offload-arch=gfx900:xnack+ -fsanitize=address -fno-gpu-sanitize --rocm-path=%S/Inputs/rocm %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=NOGPUSAN %s +// RUN: | FileCheck -check-prefixes=HOSTSAN,NOGPUSAN,SAN %s // FAIL-DAG: error: cannot find ROCm device library for ABI version 5; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library // NOTSUPPORTED-DAG: warning: ignoring '-fsanitize=leak' option as it is not currently supported for target 'amdgcn-amd-amdhsa' @@ -52,14 +52,11 @@ // NOXNACK: warning: ignoring '-fsanitize=address' option for offload arch 'gfx908' as it is not currently supported there. Use it with an offload arch containing 'xnack+' instead // XNACKNEG: warning: ignoring '-fsanitize=address' option for offload arch 'gfx908:xnack-' as it is not currently supported there. 
Use it with an offload arch containing 'xnack+' instead -// GPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "-fopenmp-targets=amdgcn-amd-amdhsa".* "-x" "c".*}} -// GPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-target-cpu" "(gfx908|gfx900)".* "-fopenmp".* "-fsanitize=address".* "-x" "c".*}} -// GPUSAN: {{"[^"]*clang-offload-packager[^"]*" "-o".* "--image=file=.*.bc,triple=amdgcn-amd-amdhsa,arch=gfx908(:xnack\-|:xnack\+)?,kind=openmp(,feature=(\-xnack|\+xnack))?"}} -// GPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "-fopenmp-targets=amdgcn-amd-amdhsa".* "-x" "ir".*}} -// GPUSAN: {{"[^"]*clang-linker-wrapper[^"]*".* "--host-triple=x86_64-unknown-linux-gnu".* "--linker-path=[^"]*".* "--whole-archive" "[^"]*(libclang_rt.asan_static.a|libclang_rt.asan_static-x86_64.a)".* "--whole-archive" "[^"]*(libclang_rt.asan.a|libclang_rt.asan-x86_64.a)".*}} +// HOSTSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "-fopenmp-targets=amdgcn-amd-amdhsa".* "-x" "c".*}} -// NOGPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "-fopenmp-targets=amdgcn-amd-amdhsa".* "-x" "c".*}} +// GPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-mlink-bitcode-file" "[^"]*asanrtl.bc".* "-mlink-bitcode-file" "[^"]*ockl.bc".* "-target-cpu" "(gfx908|gfx900)".* "-fopenmp".* "-fsanitize=address".* "-x" "c".*}} // NOGPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu".* "-emit-llvm-bc".* "-target-cpu" "(gfx908|gfx900)".* "-fopenmp".* "-x" "c".*}} -// NOGPUSAN: {{"[^"]*clang-offload-packager[^"]*" "-o".* 
"--image=file=.*.bc,triple=amdgcn-amd-amdhsa,arch=gfx908(:xnack\-|:xnack\+)?,kind=openmp(,feature=(\-xnack|\+xnack))?"}} -// NOGPUSAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "-fopenmp-targets=amdgcn-amd-amdhsa".* "-x" "ir".*}} -// NOGPUSAN: {{"[^"]*clang-linker-wrapper[^"]*".* "--host-triple=x86_64-unknown-linux-gnu".* "--linker-path=[^"]*".* "--whole-archive" "[^"]*(libclang_rt.asan_static.a|libclang_rt.asan_static-x86_64.a)".* "--whole-archive" "[^"]*(libclang_rt.asan.a|libclang_rt.asan-x86_64.a)".*}} + +// SAN: {{"[^"]*clang-offload-packager[^"]*" "-o".* "--image=file=.*.bc,triple=amdgcn-amd-amdhsa,arch=gfx908(:xnack\-|:xnack\+)?,kind=openmp(,feature=(\-xnack|\+xnack))?"}} +// SAN: {{"[^"]*clang[^"]*" "-cc1" "-triple" "x86_64-unknown-linux-gnu".* "-fopenmp".* "-fsanitize=address".* "-fopenmp-targets=amdgcn-amd-amdhsa".* "-x" "ir".*}} +// SAN: {{"[^"]*clang-linker-wrapper[^"]*".* "--host-triple=x86_64-unknown-linux-gnu".* "--linker-path=[^"]*".* "--whole-archive" "[^"]*(libclang_rt.asan_static.a|libclang_rt.asan_static-x86_64.a)".* "--whole-archive" "[^"]*(libclang_rt.asan.a|libclang_rt.asan-x86_64.a)".*}} \ No newline at end of file diff --git a/clang/test/Driver/hip-sanitize-options.hip b/clang/test/Driver/hip-sanitize-options.hip index 8a852867f5b3b..8de0ee9e18426 100644 --- a/clang/test/Driver/hip-sanitize-options.hip +++ b/clang/test/Driver/hip-sanitize-options.hip @@ -1,5 +1,5 @@ // RUN: %clang -### --target=x86_64-unknown-linux-gnu --offload-arch=gfx900:xnack+ \ -// RUN: -fsanitize=address -fgpu-sanitize \ +// RUN: -fsanitize=address \ // RUN: -nogpuinc --rocm-path=%S/Inputs/rocm \ // RUN: %s 2>&1 | FileCheck -check-prefixes=NORDC %s From 71fcc825b4e271b7608b54de27ae69fe70f00fad Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Mon, 10 Feb 2025 11:12:54 -0500 Subject: [PATCH 169/293] [NFC][StructurizeCFG] Add a test that can crash StructurizeCFG pass (#126087) I tried to fix it in #124051 but 
failed to do so. This PR adds the test and marks it as xfail. --- .../simple-structurizecfg-crash.ll | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 llvm/test/Transforms/StructurizeCFG/simple-structurizecfg-crash.ll diff --git a/llvm/test/Transforms/StructurizeCFG/simple-structurizecfg-crash.ll b/llvm/test/Transforms/StructurizeCFG/simple-structurizecfg-crash.ll new file mode 100644 index 0000000000000..c78985b12619b --- /dev/null +++ b/llvm/test/Transforms/StructurizeCFG/simple-structurizecfg-crash.ll @@ -0,0 +1,19 @@ +; RUN: opt -S -passes=structurizecfg %s -o - +; XFAIL: * + +; Issue tracking: https://github.com/llvm/llvm-project/issues/126534. +; FIXME: This test is expected to crash. Generate checklines after the crash is fixed. + +define void @foo() { +entry: + br i1 false, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + ret void +} From 8380b5c7494e5511dfdc944108ff316453a36061 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Mon, 10 Feb 2025 08:16:12 -0800 Subject: [PATCH 170/293] [TableGen][InstrInfo] Cull mapping that have not been enabled/not needed (#126137) - Detect whether logical operand mapping/named operand mappings have been enabled in a previous pass over instructions and execute the relevant emission code only if those mappings are enabled. - For these mappings, skip the fixed set of predefined instructions as they won't have these mappings enabled. - Emit operand type mappings only for X86 target, as they are only used by X86 and look for X86 specific `X86MemOperand`. - Cleanup `emitOperandTypeMappings` code: remove code to handle empty instruction list and use range for loops. 
--- .../TableGen/get-operand-type-no-expand.td | 4 +- llvm/test/TableGen/get-operand-type.td | 14 +- llvm/utils/TableGen/InstrInfoEmitter.cpp | 173 ++++++++++-------- 3 files changed, 107 insertions(+), 84 deletions(-) diff --git a/llvm/test/TableGen/get-operand-type-no-expand.td b/llvm/test/TableGen/get-operand-type-no-expand.td index 9dfcbfaec76af..a0a8fa957f9b6 100644 --- a/llvm/test/TableGen/get-operand-type-no-expand.td +++ b/llvm/test/TableGen/get-operand-type-no-expand.td @@ -5,7 +5,7 @@ include "llvm/Target/Target.td" def archInstrInfo : InstrInfo { } -def arch : Target { +def X86 : Target { let InstructionSet = archInstrInfo; } @@ -26,7 +26,7 @@ def InstA : Instruction { let InOperandList = (ins i8complex:$b, i32imm:$c); field bits<8> Inst; field bits<8> SoftFail = 0; - let Namespace = "MyNamespace"; + let Namespace = "X86"; } // RUN: llvm-tblgen -gen-instr-info -I %p/../../include %s \ diff --git a/llvm/test/TableGen/get-operand-type.td b/llvm/test/TableGen/get-operand-type.td index 6ebda5cffe8af..b2f63cafd6a89 100644 --- a/llvm/test/TableGen/get-operand-type.td +++ b/llvm/test/TableGen/get-operand-type.td @@ -1,12 +1,12 @@ // RUN: llvm-tblgen -gen-instr-info -I %p/../../include %s | FileCheck %s -// Check that getOperandType has the expected info in it +// Check that getOperandType has the expected info in it. 
include "llvm/Target/Target.td" def archInstrInfo : InstrInfo { } -def arch : Target { +def X86 : Target { let InstructionSet = archInstrInfo; } @@ -24,7 +24,7 @@ def InstA : Instruction { let InOperandList = (ins OpB:$b, i32imm:$c); field bits<8> Inst; field bits<8> SoftFail = 0; - let Namespace = "MyNamespace"; + let Namespace = "X86"; } def InstB : Instruction { @@ -33,7 +33,7 @@ def InstB : Instruction { let InOperandList = (ins unknown:$x); field bits<8> Inst; field bits<8> SoftFail = 0; - let Namespace = "MyNamespace"; + let Namespace = "X86"; } def InstC : Instruction { @@ -42,12 +42,12 @@ def InstC : Instruction { let InOperandList = (ins RegOp:$x); field bits<8> Inst; field bits<8> SoftFail = 0; - let Namespace = "MyNamespace"; + let Namespace = "X86"; } // CHECK: #ifdef GET_INSTRINFO_OPERAND_TYPE -// CHECK: static const uint{{.*}}_t Offsets[] = { -// CHECK: static const {{.*}} OpcodeOperandTypes[] = { +// CHECK: static constexpr uint{{.*}}_t Offsets[] = { +// CHECK: static constexpr {{.*}} OpcodeOperandTypes[] = { // CHECK: /* InstA */ // CHECK-NEXT: OpA, OpB, i32imm, // CHECK-NEXT: /* InstB */ diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index 7c46890a49c81..14ef4e725493b 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -261,7 +261,11 @@ void InstrInfoEmitter::emitOperandNameMappings( // Max operand index seen. unsigned MaxOperandNo = 0; - for (const CodeGenInstruction *Inst : NumberedInstructions) { + // Fixed/Predefined instructions do not have UseNamedOperandTable enabled, so + // we can just skip them. 
+ const unsigned NumFixedInsts = Target.getNumFixedInstructions(); + for (const CodeGenInstruction *Inst : + NumberedInstructions.drop_front(NumFixedInsts)) { if (!Inst->TheDef->getValueAsBit("UseNamedOperandTable")) continue; std::map OpList; @@ -335,11 +339,18 @@ void InstrInfoEmitter::emitOperandNameMappings( /// Generate an enum for all the operand types for this target, under the /// llvm::TargetNamespace::OpTypes namespace. /// Operand types are all definitions derived of the Operand Target.td class. +/// void InstrInfoEmitter::emitOperandTypeMappings( raw_ostream &OS, const CodeGenTarget &Target, ArrayRef NumberedInstructions) { - StringRef Namespace = Target.getInstNamespace(); + + // These generated functions are used only by the X86 target + // (in bolt/lib/Target/X86/X86MCPlusBuilder.cpp). So emit them only + // for X86. + if (Namespace != "X86") + return; + ArrayRef Operands = Records.getAllDerivedDefinitions("Operand"); ArrayRef RegisterOperands = @@ -376,73 +387,66 @@ void InstrInfoEmitter::emitOperandTypeMappings( return NumberedInstructions[I]->TheDef->getName(); }; // TODO: Factor out duplicate operand lists to compress the tables. - if (!NumberedInstructions.empty()) { - std::vector OperandOffsets; - std::vector OperandRecords; - int CurrentOffset = 0; - for (const CodeGenInstruction *Inst : NumberedInstructions) { - OperandOffsets.push_back(CurrentOffset); - for (const auto &Op : Inst->Operands) { - const DagInit *MIOI = Op.MIOperandInfo; - if (!ExpandMIOperandInfo || !MIOI || MIOI->getNumArgs() == 0) { - // Single, anonymous, operand. - OperandRecords.push_back(Op.Rec); + std::vector OperandOffsets; + std::vector OperandRecords; + size_t CurrentOffset = 0; + for (const CodeGenInstruction *Inst : NumberedInstructions) { + OperandOffsets.push_back(CurrentOffset); + for (const auto &Op : Inst->Operands) { + const DagInit *MIOI = Op.MIOperandInfo; + if (!ExpandMIOperandInfo || !MIOI || MIOI->getNumArgs() == 0) { + // Single, anonymous, operand. 
+ OperandRecords.push_back(Op.Rec); + ++CurrentOffset; + } else { + for (const Init *Arg : MIOI->getArgs()) { + OperandRecords.push_back(cast(Arg)->getDef()); ++CurrentOffset; - } else { - for (const Init *Arg : MIOI->getArgs()) { - OperandRecords.push_back(cast(Arg)->getDef()); - ++CurrentOffset; - } } } } + } - // Emit the table of offsets (indexes) into the operand type table. - // Size the unsigned integer offset to save space. - assert(OperandRecords.size() <= UINT32_MAX && - "Too many operands for offset table"); - OS << " static const " << getMinimalTypeForRange(OperandRecords.size()); - OS << " Offsets[] = {\n"; - for (int I = 0, E = OperandOffsets.size(); I != E; ++I) { - OS << " /* " << getInstrName(I) << " */\n"; - OS << " " << OperandOffsets[I] << ",\n"; - } - OS << " };\n"; + // Emit the table of offsets (indexes) into the operand type table. + // Size the unsigned integer offset to save space. + assert(OperandRecords.size() <= UINT32_MAX && + "Too many operands for offset table"); + OS << " static constexpr " << getMinimalTypeForRange(OperandRecords.size()); + OS << " Offsets[] = {\n"; + for (const auto &[Idx, Offset] : enumerate(OperandOffsets)) + OS << " " << Offset << ", // " << getInstrName(Idx) << '\n'; + OS << " };\n"; - // Add an entry for the end so that we don't need to special case it below. - OperandOffsets.push_back(OperandRecords.size()); - - // Emit the actual operand types in a flat table. - // Size the signed integer operand type to save space. - assert(EnumVal <= INT16_MAX && - "Too many operand types for operand types table"); - OS << "\n using namespace OpTypes;\n"; - OS << " static"; - OS << ((EnumVal <= INT8_MAX) ? " const int8_t" : " const int16_t"); - OS << " OpcodeOperandTypes[] = {\n "; - for (int I = 0, E = OperandRecords.size(), CurOffset = 0; I != E; ++I) { - // We print each Opcode's operands in its own row. 
- if (I == OperandOffsets[CurOffset]) { - OS << "\n /* " << getInstrName(CurOffset) << " */\n "; - while (OperandOffsets[++CurOffset] == I) - OS << "/* " << getInstrName(CurOffset) << " */\n "; - } - const Record *OpR = OperandRecords[I]; - if ((OpR->isSubClassOf("Operand") || - OpR->isSubClassOf("RegisterOperand") || - OpR->isSubClassOf("RegisterClass")) && - !OpR->isAnonymous()) - OS << OpR->getName(); - else - OS << -1; - OS << ", "; + // Add an entry for the end so that we don't need to special case it below. + OperandOffsets.push_back(OperandRecords.size()); + + // Emit the actual operand types in a flat table. + // Size the signed integer operand type to save space. + assert(EnumVal <= INT16_MAX && + "Too many operand types for operand types table"); + OS << "\n using namespace OpTypes;\n"; + OS << " static"; + OS << (EnumVal <= INT8_MAX ? " constexpr int8_t" : " constexpr int16_t"); + OS << " OpcodeOperandTypes[] = {"; + size_t CurOffset = 0; + for (auto [Idx, OpR] : enumerate(OperandRecords)) { + // We print each Opcode's operands in its own row. 
+ if (Idx == OperandOffsets[CurOffset]) { + OS << "\n /* " << getInstrName(CurOffset) << " */\n "; + while (OperandOffsets[++CurOffset] == Idx) + OS << "/* " << getInstrName(CurOffset) << " */\n "; } - OS << "\n };\n"; - - OS << " return OpcodeOperandTypes[Offsets[Opcode] + OpIdx];\n"; - } else { - OS << " llvm_unreachable(\"No instructions defined\");\n"; + if ((OpR->isSubClassOf("Operand") || OpR->isSubClassOf("RegisterOperand") || + OpR->isSubClassOf("RegisterClass")) && + !OpR->isAnonymous()) + OS << OpR->getName(); + else + OS << -1; + OS << ", "; } + OS << "\n };\n"; + + OS << " return OpcodeOperandTypes[Offsets[Opcode] + OpIdx];\n"; OS << "}\n"; OS << "} // end namespace llvm::" << Namespace << "\n"; OS << "#endif // GET_INSTRINFO_OPERAND_TYPE\n\n"; @@ -461,10 +465,10 @@ void InstrInfoEmitter::emitOperandTypeMappings( SizeToOperandName[Size].push_back(Op->getName()); } OS << " default: return 0;\n"; - for (const auto &KV : SizeToOperandName) { - for (const StringRef &OperandName : KV.second) + for (const auto &[Size, OperandNames] : SizeToOperandName) { + for (const StringRef &OperandName : OperandNames) OS << " case OpTypes::" << OperandName << ":\n"; - OS << " return " << KV.first << ";\n\n"; + OS << " return " << Size << ";\n\n"; } OS << " }\n}\n"; OS << "} // end namespace llvm::" << Namespace << "\n"; @@ -475,12 +479,15 @@ void InstrInfoEmitter::emitLogicalOperandSizeMappings( raw_ostream &OS, StringRef Namespace, ArrayRef NumberedInstructions) { std::map, unsigned> LogicalOpSizeMap; - std::map> InstMap; size_t LogicalOpListSize = 0U; std::vector LogicalOpList; - for (const auto *Inst : NumberedInstructions) { + + // Fixed/Predefined instructions do not have UseLogicalOperandMappings + // enabled, so we can just skip them. 
+ const unsigned NumFixedInsts = CDP.getTargetInfo().getNumFixedInstructions(); + for (const auto *Inst : NumberedInstructions.drop_front(NumFixedInsts)) { if (!Inst->TheDef->getValueAsBit("UseLogicalOperandMappings")) continue; @@ -907,22 +914,34 @@ void InstrInfoEmitter::run(raw_ostream &OS) { unsigned OperandInfoSize = CollectOperandInfo(OperandInfoList, OperandInfoMap); + ArrayRef NumberedInstructions = + Target.getInstructionsByEnumValue(); + // Collect all of the instruction's implicit uses and defs. + // Also collect which features are enabled by instructions to control + // emission of various mappings. + + bool HasUseLogicalOperandMappings = false; + bool HasUseNamedOperandTable = false; + Timer.startTimer("Collect uses/defs"); std::map, unsigned> EmittedLists; std::vector> ImplicitLists; unsigned ImplicitListSize = 0; - for (const CodeGenInstruction *II : Target.getInstructionsByEnumValue()) { - std::vector ImplicitOps = II->ImplicitUses; - llvm::append_range(ImplicitOps, II->ImplicitDefs); + for (const CodeGenInstruction *Inst : NumberedInstructions) { + HasUseLogicalOperandMappings |= + Inst->TheDef->getValueAsBit("UseLogicalOperandMappings"); + HasUseNamedOperandTable |= + Inst->TheDef->getValueAsBit("UseNamedOperandTable"); + + std::vector ImplicitOps = Inst->ImplicitUses; + llvm::append_range(ImplicitOps, Inst->ImplicitDefs); if (EmittedLists.insert({ImplicitOps, ImplicitListSize}).second) { ImplicitLists.push_back(ImplicitOps); ImplicitListSize += ImplicitOps.size(); } } - ArrayRef NumberedInstructions = - Target.getInstructionsByEnumValue(); OS << "#if defined(GET_INSTRINFO_MC_DESC) || " "defined(GET_INSTRINFO_CTOR_DTOR)\n"; OS << "namespace llvm {\n\n"; @@ -1123,14 +1142,18 @@ void InstrInfoEmitter::run(raw_ostream &OS) { OS << "#endif // GET_INSTRINFO_CTOR_DTOR\n\n"; - Timer.startTimer("Emit operand name mappings"); - emitOperandNameMappings(OS, Target, NumberedInstructions); + if (HasUseNamedOperandTable) { + Timer.startTimer("Emit operand name 
mappings"); + emitOperandNameMappings(OS, Target, NumberedInstructions); + } Timer.startTimer("Emit operand type mappings"); emitOperandTypeMappings(OS, Target, NumberedInstructions); - Timer.startTimer("Emit logical operand size mappings"); - emitLogicalOperandSizeMappings(OS, TargetName, NumberedInstructions); + if (HasUseLogicalOperandMappings) { + Timer.startTimer("Emit logical operand size mappings"); + emitLogicalOperandSizeMappings(OS, TargetName, NumberedInstructions); + } Timer.startTimer("Emit logical operand type mappings"); emitLogicalOperandTypeMappings(OS, TargetName, NumberedInstructions); From f3cd2238383f695c719e7eab6aebec828781ec91 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Tue, 11 Feb 2025 01:16:40 +0900 Subject: [PATCH 171/293] [OpenMP][OpenMPIRBuilder] Add initial changes for SPIR-V target frontend support (#125920) As Intel is working to add support for SPIR-V OpenMP device offloading in upstream clang/liboffload, we need to modify the OpenMP frontend to allow SPIR-V as well as generate valid IR for SPIR-V. For example, we need the frontend to generate code to define and interact with device globals used in the DeviceRTL. This is the beginning of what I expect will be (many) other changes, but let's get started with something simple. 
--------- Signed-off-by: Sarnie, Nick --- clang/include/clang/Basic/TargetInfo.h | 2 +- clang/lib/CodeGen/CodeGenModule.cpp | 6 ++++-- .../test/OpenMP/spirv_target_codegen_basic.cpp | 17 +++++++++++++++++ .../llvm/Frontend/OpenMP/OMPGridValues.h | 11 +++++++++++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 4 ++++ 5 files changed, 37 insertions(+), 3 deletions(-) create mode 100644 clang/test/OpenMP/spirv_target_codegen_basic.cpp diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index b9e46a5e7d1ca..070cc792ca7db 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -1662,7 +1662,7 @@ class TargetInfo : public TransferrableTargetInfo, // access target-specific GPU grid values that must be consistent between // host RTL (plugin), deviceRTL and clang. virtual const llvm::omp::GV &getGridValue() const { - llvm_unreachable("getGridValue not implemented on this target"); + return llvm::omp::SPIRVGridValues; } /// Retrieve the name of the platform as it is used in the diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index c056d103a7fe4..7924c32fcf633 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -486,8 +486,10 @@ void CodeGenModule::createOpenMPRuntime() { case llvm::Triple::nvptx: case llvm::Triple::nvptx64: case llvm::Triple::amdgcn: - assert(getLangOpts().OpenMPIsTargetDevice && - "OpenMP AMDGPU/NVPTX is only prepared to deal with device code."); + case llvm::Triple::spirv64: + assert( + getLangOpts().OpenMPIsTargetDevice && + "OpenMP AMDGPU/NVPTX/SPIRV is only prepared to deal with device code."); OpenMPRuntime.reset(new CGOpenMPRuntimeGPU(*this)); break; default: diff --git a/clang/test/OpenMP/spirv_target_codegen_basic.cpp b/clang/test/OpenMP/spirv_target_codegen_basic.cpp new file mode 100644 index 0000000000000..fb2810e88c063 --- /dev/null +++ b/clang/test/OpenMP/spirv_target_codegen_basic.cpp 
@@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-host.bc -o - | FileCheck %s + +// expected-no-diagnostics + +// CHECK: @__omp_offloading_{{.*}}_dynamic_environment = weak_odr protected addrspace(1) global %struct.DynamicEnvironmentTy zeroinitializer +// CHECK: @__omp_offloading_{{.*}}_kernel_environment = weak_odr protected addrspace(1) constant %struct.KernelEnvironmentTy + +// CHECK: define weak_odr protected spir_kernel void @__omp_offloading_{{.*}} + +int main() { + int ret = 0; + #pragma omp target + for(int i = 0; i < 5; i++) + ret++; + return ret; +} diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h index bfac2d734b81d..788a3c8a56f38 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h @@ -120,6 +120,17 @@ static constexpr GV NVPTXGridValues = { 128, // GV_Default_WG_Size }; +/// For generic SPIR-V GPUs +static constexpr GV SPIRVGridValues = { + 256, // GV_Slot_Size + 64, // GV_Warp_Size + (1 << 16), // GV_Max_Teams + 440, // GV_Default_Num_Teams + 896, // GV_SimpleBufferSize + 1024, // GV_Max_WG_Size, + 256, // GV_Default_WG_Size +}; + } // namespace omp } // namespace llvm diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 91fc16e54c88f..f30eb64f1b4c9 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -159,6 +159,8 @@ static const omp::GV &getGridValue(const Triple &T, Function *Kernel) { } if (T.isNVPTX()) return omp::NVPTXGridValues; + if (T.isSPIRV()) + return omp::SPIRVGridValues; llvm_unreachable("No grid value available for this architecture!"); } 
@@ -6472,6 +6474,8 @@ void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes( OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL); else if (T.isNVPTX()) OutlinedFn->setCallingConv(CallingConv::PTX_Kernel); + else if (T.isSPIRV()) + OutlinedFn->setCallingConv(CallingConv::SPIR_KERNEL); } } From 6b52fb25b90e575b507343bde0162d3d652ff666 Mon Sep 17 00:00:00 2001 From: Asher Mancinelli Date: Mon, 10 Feb 2025 08:21:22 -0800 Subject: [PATCH 172/293] [flang] Correctly handle `!dir$ unroll` with unrolling factors of 0 and 1 (#126170) https://github.com/llvm/llvm-project/pull/123331 added support for the unrolling directive. In the presence of an explicit unrolling factor, that unrolling factor would be unconditionally passed into the metadata even when it was 1 or 0. These special cases should instead disable unrolling. Adding an explicit unrolling factor of 0 triggered this assertion which is fixed by this patch: ``` unsigned int unrollCountPragmaValue(const llvm::Loop*): Assertion `Count >= 1 && "Unroll count must be positive."' failed. ``` Updated tests and documentation. --- flang/docs/Directives.md | 9 +++++- flang/lib/Lower/Bridge.cpp | 47 +++++++++++++++++++++++-------- flang/test/Integration/unroll.f90 | 45 +++++++++++++++++++++++++---- 3 files changed, 82 insertions(+), 19 deletions(-) diff --git a/flang/docs/Directives.md b/flang/docs/Directives.md index f356f762b13a2..c6c2e29a420ea 100644 --- a/flang/docs/Directives.md +++ b/flang/docs/Directives.md @@ -39,15 +39,22 @@ A list of non-standard directives supported by Flang * `!dir$ vector always` forces vectorization on the following loop regardless of cost model decisions. The loop must still be vectorizable. [This directive currently only works on plain do loops without labels]. +* `!dir$ unroll [n]` specifies that the compiler ought to unroll the immediately + following loop `n` times. When `n` is `0` or `1`, the loop should not be unrolled + at all. 
When `n` is `2` or greater, the loop should be unrolled exactly `n` + times if possible. When `n` is omitted, the compiler should attempt to fully + unroll the loop. Some compilers accept an optional `=` before the `n` when `n` + is present in the directive. Flang does not. # Directive Details ## Introduction Directives are commonly used in Fortran programs to specify additional actions to be performed by the compiler. The directives are always specified with the -`!dir$` or `cdir$` prefix. +`!dir$` or `cdir$` prefix. ## Loop Directives + Some directives are associated with the following construct, for example loop directives. Directives on loops are used to specify additional transformation to be performed by the compiler like enabling vectorisation, unrolling, interchange diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index a31629b17cf29..36e58e456dea3 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -63,6 +63,7 @@ #include "flang/Semantics/tools.h" #include "flang/Support/Version.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Parser/Parser.h" @@ -2170,11 +2171,38 @@ class FirConverter : public Fortran::lower::AbstractConverter { return builder->createIntegerConstant(loc, controlType, 1); // step } + // For unroll directives without a value, force full unrolling. + // For unroll directives with a value, if the value is greater than 1, + // force unrolling with the given factor. Otherwise, disable unrolling. 
+ mlir::LLVM::LoopUnrollAttr + genLoopUnrollAttr(std::optional directiveArg) { + mlir::BoolAttr falseAttr = + mlir::BoolAttr::get(builder->getContext(), false); + mlir::BoolAttr trueAttr = mlir::BoolAttr::get(builder->getContext(), true); + mlir::IntegerAttr countAttr; + mlir::BoolAttr fullUnrollAttr; + bool shouldUnroll = true; + if (directiveArg.has_value()) { + auto unrollingFactor = directiveArg.value(); + if (unrollingFactor == 0 || unrollingFactor == 1) { + shouldUnroll = false; + } else { + countAttr = + builder->getIntegerAttr(builder->getI64Type(), unrollingFactor); + } + } else { + fullUnrollAttr = trueAttr; + } + + mlir::BoolAttr disableAttr = shouldUnroll ? falseAttr : trueAttr; + return mlir::LLVM::LoopUnrollAttr::get( + builder->getContext(), /*disable=*/disableAttr, /*count=*/countAttr, {}, + /*full=*/fullUnrollAttr, {}, {}, {}); + } + void addLoopAnnotationAttr( IncrementLoopInfo &info, llvm::SmallVectorImpl &dirs) { - mlir::BoolAttr f = mlir::BoolAttr::get(builder->getContext(), false); - mlir::BoolAttr t = mlir::BoolAttr::get(builder->getContext(), true); mlir::LLVM::LoopVectorizeAttr va; mlir::LLVM::LoopUnrollAttr ua; bool has_attrs = false; @@ -2182,20 +2210,15 @@ class FirConverter : public Fortran::lower::AbstractConverter { Fortran::common::visit( Fortran::common::visitors{ [&](const Fortran::parser::CompilerDirective::VectorAlways &) { + mlir::BoolAttr falseAttr = + mlir::BoolAttr::get(builder->getContext(), false); va = mlir::LLVM::LoopVectorizeAttr::get(builder->getContext(), - /*disable=*/f, {}, {}, - {}, {}, {}, {}); + /*disable=*/falseAttr, + {}, {}, {}, {}, {}, {}); has_attrs = true; }, [&](const Fortran::parser::CompilerDirective::Unroll &u) { - mlir::IntegerAttr countAttr; - if (u.v.has_value()) { - countAttr = builder->getIntegerAttr(builder->getI64Type(), - u.v.value()); - } - ua = mlir::LLVM::LoopUnrollAttr::get( - builder->getContext(), /*disable=*/f, /*count*/ countAttr, - {}, /*full*/ u.v.has_value() ? 
f : t, {}, {}, {}); + ua = genLoopUnrollAttr(u.v); has_attrs = true; }, [&](const auto &) {}}, diff --git a/flang/test/Integration/unroll.f90 b/flang/test/Integration/unroll.f90 index 9d69605e10d1b..aa47e465b63fc 100644 --- a/flang/test/Integration/unroll.f90 +++ b/flang/test/Integration/unroll.f90 @@ -3,14 +3,47 @@ ! CHECK-LABEL: unroll_dir subroutine unroll_dir integer :: a(10) - !dir$ unroll - ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION:.*]] + !dir$ unroll + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[UNROLL_ENABLE_FULL_ANNO:.*]] do i=1,10 - a(i)=i + a(i)=i end do end subroutine unroll_dir -! CHECK: ![[ANNOTATION]] = distinct !{![[ANNOTATION]], ![[UNROLL:.*]], ![[UNROLL_FULL:.*]]} -! CHECK: ![[UNROLL]] = !{!"llvm.loop.unroll.enable"} -! CHECK: ![[UNROLL_FULL]] = !{!"llvm.loop.unroll.full"} +! CHECK-LABEL: unroll_dir_0 +subroutine unroll_dir_0 + integer :: a(10) + !dir$ unroll 0 + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[UNROLL_DISABLE_ANNO:.*]] + do i=1,10 + a(i)=i + end do +end subroutine unroll_dir_0 + +! CHECK-LABEL: unroll_dir_1 +subroutine unroll_dir_1 + integer :: a(10) + !dir$ unroll 1 + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[UNROLL_DISABLE_ANNO]] + do i=1,10 + a(i)=i + end do +end subroutine unroll_dir_1 + +! CHECK-LABEL: unroll_dir_2 +subroutine unroll_dir_2 + integer :: a(10) + !dir$ unroll 2 + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[UNROLL_ENABLE_COUNT_2:.*]] + do i=1,10 + a(i)=i + end do +end subroutine unroll_dir_2 +! CHECK: ![[UNROLL_ENABLE_FULL_ANNO]] = distinct !{![[UNROLL_ENABLE_FULL_ANNO]], ![[UNROLL_ENABLE:.*]], ![[UNROLL_FULL:.*]]} +! CHECK: ![[UNROLL_ENABLE:.*]] = !{!"llvm.loop.unroll.enable"} +! CHECK: ![[UNROLL_FULL:.*]] = !{!"llvm.loop.unroll.full"} +! CHECK: ![[UNROLL_DISABLE_ANNO]] = distinct !{![[UNROLL_DISABLE_ANNO]], ![[UNROLL_DISABLE:.*]]} +! CHECK: ![[UNROLL_DISABLE]] = !{!"llvm.loop.unroll.disable"} +! 
CHECK: ![[UNROLL_ENABLE_COUNT_2]] = distinct !{![[UNROLL_ENABLE_COUNT_2]], ![[UNROLL_ENABLE]], ![[UNROLL_COUNT_2:.*]]} +! CHECK: ![[UNROLL_COUNT_2]] = !{!"llvm.loop.unroll.count", i32 2} From 7ae78a6cdb6ce9ad1534ed10519649fb3d47aca9 Mon Sep 17 00:00:00 2001 From: lonely eagle <2020382038@qq.com> Date: Tue, 11 Feb 2025 00:21:59 +0800 Subject: [PATCH 173/293] [mlir][vector]add extractInsertFoldConstantOp fold function and apply it to extractOp and insertOp. (#124399) add extractInsertFoldConstantOp fold function and apply it to extractOp and insertOp. --- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 45 +++++++++++++++++++ .../vector-to-llvm-interface.mlir | 42 +++++++++++++++++ mlir/test/Dialect/Vector/canonicalize.mlir | 26 +++++++++++ .../Vector/vector-warp-distribute.mlir | 3 +- 4 files changed, 114 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index b4a5461f4405d..94f9ead9e1665 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -1989,6 +1989,45 @@ static Value foldScalarExtractFromFromElements(ExtractOp extractOp) { return fromElementsOp.getElements()[flatIndex]; } +/// If the dynamic indices of `extractOp` or `insertOp` are in fact constants, +/// then fold it. +template +static Value extractInsertFoldConstantOp(OpType op, AdaptorType adaptor, + SmallVectorImpl &operands) { + std::vector staticPosition = op.getStaticPosition().vec(); + OperandRange dynamicPosition = op.getDynamicPosition(); + ArrayRef dynamicPositionAttr = adaptor.getDynamicPosition(); + + // If the dynamic operands is empty, it is returned directly. + if (!dynamicPosition.size()) + return {}; + + // `index` is used to iterate over the `dynamicPosition`. + unsigned index = 0; + + // `opChange` is a flag. If it is true, it means to update `op` in place. 
+ bool opChange = false; + for (unsigned i = 0, e = staticPosition.size(); i < e; ++i) { + if (!ShapedType::isDynamic(staticPosition[i])) + continue; + Attribute positionAttr = dynamicPositionAttr[index]; + Value position = dynamicPosition[index++]; + if (auto attr = mlir::dyn_cast_if_present(positionAttr)) { + staticPosition[i] = attr.getInt(); + opChange = true; + continue; + } + operands.push_back(position); + } + + if (opChange) { + op.setStaticPosition(staticPosition); + op.getOperation()->setOperands(operands); + return op.getResult(); + } + return {}; +} + /// Fold an insert or extract operation into an poison value when a poison index /// is found at any dimension of the static position. static Attribute foldPoisonIndexInsertExtractOp(MLIRContext *context, @@ -2035,6 +2074,9 @@ OpFoldResult ExtractOp::fold(FoldAdaptor adaptor) { return val; if (auto val = foldScalarExtractFromFromElements(*this)) return val; + SmallVector operands = {getVector()}; + if (auto val = extractInsertFoldConstantOp(*this, adaptor, operands)) + return val; return OpFoldResult(); } @@ -3094,6 +3136,9 @@ OpFoldResult vector::InsertOp::fold(FoldAdaptor adaptor) { // (type mismatch). 
if (getNumIndices() == 0 && getSourceType() == getType()) return getSource(); + SmallVector operands = {getSource(), getDest()}; + if (auto val = extractInsertFoldConstantOp(*this, adaptor, operands)) + return val; if (auto res = foldPoisonIndexInsertExtractOp( getContext(), adaptor.getStaticPosition(), kPoisonIndex)) return res; diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir index d319b9043b4b8..d261327ec005f 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir @@ -530,6 +530,25 @@ func.func @extract_scalar_from_vec_0d_index(%arg0: vector) -> index { // ----- +func.func @extract_scalar_from_vec_2d_f32_dynamic_idxs_compile_time_const(%arg : vector<32x1xi32>) -> i32 { + %0 = arith.constant 0 : index + %1 = vector.extract %arg[%0, %0] : i32 from vector<32x1xi32> + return %1 : i32 +} + +// At compile time, since the indices of extractOp are constants, +// they will be collapsed and folded away; therefore, the lowering works. 
+ +// CHECK-LABEL: @extract_scalar_from_vec_2d_f32_dynamic_idxs_compile_time_const +// CHECK-SAME: %[[ARG:.*]]: vector<32x1xi32>) -> i32 { +// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : vector<32x1xi32> to !llvm.array<32 x vector<1xi32>> +// CHECK: %[[VEC_0:.*]] = llvm.extractvalue %[[CAST]][0] : !llvm.array<32 x vector<1xi32>> +// CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[RES:.*]] = llvm.extractelement %[[VEC_0]]{{\[}}%[[C0]] : i64] : vector<1xi32> +// CHECK: return %[[RES]] : i32 + +// ----- + //===----------------------------------------------------------------------===// // vector.insertelement //===----------------------------------------------------------------------===// @@ -781,6 +800,29 @@ func.func @insert_scalar_into_vec_2d_f32_dynamic_idx_scalable(%arg0: vector<1x[1 // ----- +func.func @insert_scalar_from_vec_2d_f32_dynamic_idxs_compile_time_const(%arg : vector<4x1xi32>) -> vector<4x1xi32> { + %0 = arith.constant 0 : index + %1 = arith.constant 1 : i32 + %res = vector.insert %1, %arg[%0, %0] : i32 into vector<4x1xi32> + return %res : vector<4x1xi32> +} + +// At compile time, since the indices of insertOp are constants, +// they will be collapsed and folded away; therefore, the lowering works. 
+ +// CHECK-LABEL: @insert_scalar_from_vec_2d_f32_dynamic_idxs_compile_time_const +// CHECK-SAME: %[[ARG:.*]]: vector<4x1xi32>) -> vector<4x1xi32> { +// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : vector<4x1xi32> to !llvm.array<4 x vector<1xi32>> +// CHECK: %[[C1:.*]] = arith.constant 1 : i32 +// CHECK: %[[VEC_0:.*]] = llvm.extractvalue %[[CAST]][0] : !llvm.array<4 x vector<1xi32>> +// CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[VEC_1:.*]] = llvm.insertelement %[[C1]], %[[VEC_0]]{{\[}}%[[C0]] : i64] : vector<1xi32> +// CHECK: %[[VEC_2:.*]] = llvm.insertvalue %[[VEC_1]], %[[CAST]][0] : !llvm.array<4 x vector<1xi32>> +// CHECK: %[[RES:.*]] = builtin.unrealized_conversion_cast %[[VEC_2]] : !llvm.array<4 x vector<1xi32>> to vector<4x1xi32> +// CHECK: return %[[RES]] : vector<4x1xi32> + +// ----- + //===----------------------------------------------------------------------===// // vector.type_cast // diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index a74e562ad2f68..93581cbfbe5e4 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -3171,3 +3171,29 @@ func.func @contiguous_scatter_step(%base: memref, memref, vector<16xindex>, vector<16xi1>, vector<16xf32> return } + +// ----- + +// CHECK-LABEL: @fold_extract_constant_indices +// CHECK-SAME: %[[ARG:.*]]: vector<32x1xi32>) -> i32 { +// CHECK: %[[RES:.*]] = vector.extract %[[ARG]][0, 0] : i32 from vector<32x1xi32> +// CHECK: return %[[RES]] : i32 +func.func @fold_extract_constant_indices(%arg : vector<32x1xi32>) -> i32 { + %0 = arith.constant 0 : index + %1 = vector.extract %arg[%0, %0] : i32 from vector<32x1xi32> + return %1 : i32 +} + +// ----- + +// CHECK-LABEL: @fold_insert_constant_indices +// CHECK-SAME: %[[ARG:.*]]: vector<4x1xi32>) -> vector<4x1xi32> { +// CHECK: %[[VAL:.*]] = arith.constant 1 : i32 +// CHECK: %[[RES:.*]] = vector.insert %[[VAL]], %[[ARG]] [0, 
0] : i32 into vector<4x1xi32> +// CHECK: return %[[RES]] : vector<4x1xi32> +func.func @fold_insert_constant_indices(%arg : vector<4x1xi32>) -> vector<4x1xi32> { + %0 = arith.constant 0 : index + %1 = arith.constant 1 : i32 + %res = vector.insert %1, %arg[%0, %0] : i32 into vector<4x1xi32> + return %res : vector<4x1xi32> +} diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir index dbe0b39422369..38771f2593449 100644 --- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir +++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir @@ -778,12 +778,11 @@ func.func @warp_constant(%laneid: index) -> (vector<1xf32>) { // CHECK-PROP-LABEL: func.func @vector_extract_1d( // CHECK-PROP-DAG: %[[C5_I32:.*]] = arith.constant 5 : i32 -// CHECK-PROP-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-PROP: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2xf32>) { // CHECK-PROP: %[[V:.*]] = "some_def"() : () -> vector<64xf32> // CHECK-PROP: gpu.yield %[[V]] : vector<64xf32> // CHECK-PROP: } -// CHECK-PROP: %[[E:.*]] = vector.extract %[[R]][%[[C1]]] : f32 from vector<2xf32> +// CHECK-PROP: %[[E:.*]] = vector.extract %[[R]][1] : f32 from vector<2xf32> // CHECK-PROP: %[[SHUFFLED:.*]], %{{.*}} = gpu.shuffle idx %[[E]], %[[C5_I32]] // CHECK-PROP: return %[[SHUFFLED]] : f32 func.func @vector_extract_1d(%laneid: index) -> (f32) { From 3706dfef660097f24fb5efbac0d7f14b424492ed Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 10 Feb 2025 16:29:42 +0000 Subject: [PATCH 174/293] [LV] Forget LCSSA phi with new pred before other SCEV invalidation. (#119897) `forgetLcssaPhiWithNewPredecessor` performs additional invalidation if there is an existing SCEV for the phi, but earlier `forgetBlockAndLoopDispositions` or `forgetLoop` may already invalidate the SCEV for the phi. Change the order to first call `forgetLcssaPhiWithNewPredecessor` to ensure it runs before its SCEV gets invalidated too eagerly. 
Fixes https://github.com/llvm/llvm-project/issues/119665. PR: https://github.com/llvm/llvm-project/pull/119897 --- .../Transforms/Vectorize/LoopVectorize.cpp | 8 +- ...idate-scev-at-scope-after-vectorization.ll | 118 ++++++++++++++++++ 2 files changed, 122 insertions(+), 4 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 610e4904a80ad..f2241be60ce05 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2937,10 +2937,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { if (EnableVPlanNativePath) fixNonInductionPHIs(State); - // Forget the original basic block. - PSE.getSE()->forgetLoop(OrigLoop); - PSE.getSE()->forgetBlockAndLoopDispositions(); - // After vectorization, the exit blocks of the original loop will have // additional predecessors. Invalidate SCEVs for the exit phis in case SE // looked through single-entry phis. @@ -2950,6 +2946,10 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { for (PHINode &PN : Exit->phis()) PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); + // Forget the original basic block. + PSE.getSE()->forgetLoop(OrigLoop); + PSE.getSE()->forgetBlockAndLoopDispositions(); + // Don't apply optimizations below when no vector region remains, as they all // require a vector loop at the moment. 
if (!State.Plan->getVectorLoopRegion()) diff --git a/llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll new file mode 100644 index 0000000000000..235a8f0fa34a8 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/invalidate-scev-at-scope-after-vectorization.ll @@ -0,0 +1,118 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='print,loop-vectorize' -force-vector-width=4 -scalar-evolution-classify-expressions=false -S %s | FileCheck %s + +; Test case for https://github.com/llvm/llvm-project/issues/119665. + +; %loop.2's backedge-taken-count depends on %add.1 from %loop.1 via its +; corresponding SCEV at the scope of %loop.2. After vectorizing %loop.1, %add.1 +; isn't available at the entry of %loop.2 anymore and %add.1 at %loop.2's scope +; must be invalidated, as well as %loop.2's backedge-taken count. +define void @test_invalidate_scevs_at_scope(ptr %p) { +; CHECK-LABEL: define void @test_invalidate_scevs_at_scope( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; 
CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: br i1 false, label %[[EXIT_1:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_1:.*]] +; CHECK: [[LOOP_1]]: +; CHECK-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP_1]] ] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[P]], align 4 +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[TMP4]], [[IV_1]] +; CHECK-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1 +; CHECK-NEXT: [[C_1:%.*]] = icmp eq i32 [[IV_1]], 100 +; CHECK-NEXT: br i1 [[C_1]], label %[[EXIT_1]], label %[[LOOP_1]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT_1]]: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD_1]], %[[LOOP_1]] ], [ [[TMP3]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[ADD_LCSSA]], i32 100) +; CHECK-NEXT: [[TMP5:%.*]] = sub i32 [[SMAX]], [[ADD_LCSSA]] +; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP6]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP7]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH2:.*]], label %[[VECTOR_PH3:.*]] +; CHECK: [[VECTOR_PH3]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP7]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP7]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY4:.*]] +; CHECK: [[VECTOR_BODY4]]: +; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ 0, %[[VECTOR_PH3]] ], [ [[INDEX_NEXT8:%.*]], %[[VECTOR_BODY4]] ] +; CHECK-NEXT: [[VEC_IND6:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH3]] ], [ [[VEC_IND_NEXT7:%.*]], 
%[[VECTOR_BODY4]] ] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX5]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 0 +; CHECK-NEXT: store <4 x i64> [[VEC_IND6]], ptr [[TMP10]], align 4 +; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT7]] = add <4 x i64> [[VEC_IND6]], splat (i64 4) +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK1:.*]], label %[[VECTOR_BODY4]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK1]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP7]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_2:.*]], label %[[SCALAR_PH2]] +; CHECK: [[SCALAR_PH2]]: +; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK1]] ], [ 0, %[[EXIT_1]] ] +; CHECK-NEXT: br label %[[LOOP_2:.*]] +; CHECK: [[LOOP_2]]: +; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[BC_RESUME_VAL9]], %[[SCALAR_PH2]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP_2]] ] +; CHECK-NEXT: [[IV_2_TRUNC:%.*]] = trunc i64 [[IV_2]] to i32 +; CHECK-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], 1 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV_2]] +; CHECK-NEXT: store i64 [[IV_2]], ptr [[GEP]], align 4 +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_LCSSA]], [[IV_2_TRUNC]] +; CHECK-NEXT: [[C_2:%.*]] = icmp slt i32 [[ADD_2]], 100 +; CHECK-NEXT: br i1 [[C_2]], label %[[LOOP_2]], label %[[EXIT_2]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[EXIT_2]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.1 + +loop.1: + %iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %loop.1 ] + %1 = load i32, ptr %p, align 4 + %add.1 = add i32 %1, %iv.1 + %iv.1.next = add i32 %iv.1, 1 + %c.1 = icmp eq i32 %iv.1, 100 + br i1 %c.1, label %exit.1, label %loop.1 + +exit.1: + %add.lcssa = phi i32 [ %add.1, %loop.1 ] + br label %loop.2 + +loop.2: + %iv.2 = phi i64 [ 0, %exit.1 ], [ 
%iv.2.next, %loop.2 ] + %iv.2.trunc = trunc i64 %iv.2 to i32 + %iv.2.next = add i64 %iv.2, 1 + %gep = getelementptr inbounds i64, ptr %p, i64 %iv.2 + store i64 %iv.2, ptr %gep + %add.2 = add i32 %add.lcssa, %iv.2.trunc + %c.2 = icmp slt i32 %add.2, 100 + br i1 %c.2, label %loop.2, label %exit.2 + +exit.2: + ret void +} + +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. From 62ae876b1ba2f03bb125174aa24e30b4ebd351a5 Mon Sep 17 00:00:00 2001 From: Tai Ly Date: Mon, 10 Feb 2025 10:36:29 -0600 Subject: [PATCH 175/293] [mlir][tosa] Fix conv op build functions (#126321) This patch fixes several issues: - buildConvOpWithQuantInfo: call buildConvOpResultTypeInfo to get final output type - buildTransConvOpWithQuantInfo: add input_zp and weight_zp operands remove input_zp/weight_zp attributes - createZeroPointTensor: add getElementTypeOrSelf to get element type just in case remove bad auto-merge lines Change-Id: Idbf88f500ce57a865da4b7be7b7b8bf2ba194b24 Signed-off-by: Tai Ly --- mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 37 +++++++++++++--------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index 955021abdd67b..fd166cc1322ce 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -510,7 +510,13 @@ static void buildConvOpWithQuantInfo(OpBuilder &builder, OperationState &result, result.addAttribute("stride", stride); result.addAttribute("dilation", dilation); result.addAttribute("acc_type", accType); - result.addTypes(outputType); + Type finalOutputType = outputType; + auto quantAttr = 
buildConvOpQuantizationAttr(builder, input, weight); + if (quantAttr) { + finalOutputType = + buildConvOpResultTypeInfo(builder, outputType, input, weight); + } + result.addTypes(finalOutputType); } /// Handles tosa.transpose_conv2d which has outpad and output shape @@ -519,25 +525,19 @@ static void buildTransConvOpWithQuantInfo( OpBuilder &builder, OperationState &result, Type outputType, Value input, Value weight, Value bias, DenseI64ArrayAttr outpad, DenseI64ArrayAttr stride, DenseI64ArrayAttr outputShape, TypeAttr accType) { - result.addOperands({input, weight, bias}); + auto zps = createZPsAsConst(builder, input, weight); + result.addOperands({input, weight, bias, zps.first, zps.second}); result.addAttribute("out_pad", outpad); result.addAttribute("stride", stride); result.addAttribute("out_shape", outputShape); result.addAttribute("acc_type", accType); - auto quantAttr = ::buildConvOpQuantizationAttr(builder, input, weight); - + Type finalOutputType = outputType; + auto quantAttr = buildConvOpQuantizationAttr(builder, input, weight); if (quantAttr) { - result.addAttribute("input_zp", - builder.getI32IntegerAttr( - static_cast(quantAttr.getInputZp()))); - result.addAttribute("weight_zp", - builder.getI32IntegerAttr( - static_cast(quantAttr.getWeightZp()))); - result.addTypes( - buildConvOpResultTypeInfo(builder, outputType, input, weight)); - } else { - result.addTypes(outputType); + finalOutputType = + buildConvOpResultTypeInfo(builder, outputType, input, weight); } + result.addTypes(finalOutputType); } /// The tosa.fully_connected op has its own builder as it does not have @@ -2492,18 +2492,15 @@ LogicalResult mlir::tosa::getZeroPoint(ElementsAttr zpAttr, int64_t &zp) { return failure(); } -// Create a rank-0 const tensor for zero point of the source tensor. +// Create a rank-1 const tensor for zero point of the source tensor. 
std::optional mlir::tosa::createZeroPointTensor(OpBuilder &builder, Location loc, Type srcElemType, int64_t zp) { - if (auto quantType = - llvm::dyn_cast(srcElemType)) - srcElemType = quantType.getStorageType(); - - auto zpType = mlir::RankedTensorType::get({1}, srcElemType); + srcElemType = getElementTypeOrSelf(srcElemType); if (auto quantType = llvm::dyn_cast(srcElemType)) srcElemType = quantType.getStorageType(); + auto zpType = mlir::RankedTensorType::get({1}, srcElemType); if (llvm::isa(srcElemType)) { auto zpAttr = DenseElementsAttr::get( zpType, builder.getFloatAttr(srcElemType, static_cast(zp))); From 6a8439b5933e71d6dc93d5bdc921340efaa9522f Mon Sep 17 00:00:00 2001 From: Prashanth Date: Mon, 10 Feb 2025 22:28:46 +0530 Subject: [PATCH 176/293] [libc][docs] Add sys/statvfs to documentation and YAML definitions (#126413) These changes ensure that the sys/statvfs header is documented properly with respect to the issue ( https://github.com/llvm/llvm-project/issues/122006 ) . --- libc/docs/CMakeLists.txt | 1 + libc/docs/headers/index.rst | 1 + libc/utils/docgen/sys/statvfs.yaml | 11 +++++++++++ 3 files changed, 13 insertions(+) create mode 100644 libc/utils/docgen/sys/statvfs.yaml diff --git a/libc/docs/CMakeLists.txt b/libc/docs/CMakeLists.txt index 97f27fe6a6e0c..4ef0e920de683 100644 --- a/libc/docs/CMakeLists.txt +++ b/libc/docs/CMakeLists.txt @@ -59,6 +59,7 @@ if (SPHINX_FOUND) sys/mman sys/resource sys/stat + sys/statvfs sys/time sys/wait termios diff --git a/libc/docs/headers/index.rst b/libc/docs/headers/index.rst index 745b6f44750db..3dc30ef90a8e4 100644 --- a/libc/docs/headers/index.rst +++ b/libc/docs/headers/index.rst @@ -30,6 +30,7 @@ Implementation Status sys/mman sys/resource sys/stat + sys/statvfs sys/time sys/wait termios diff --git a/libc/utils/docgen/sys/statvfs.yaml b/libc/utils/docgen/sys/statvfs.yaml new file mode 100644 index 0000000000000..b13c3cb7203e5 --- /dev/null +++ b/libc/utils/docgen/sys/statvfs.yaml @@ -0,0 +1,11 @@ +functions: + 
fstatvfs: + in-latest-posix: '' + statvfs: + in-latest-posix: '' + +macros: + ST_RDONLY: + in-latest-posix: '' + ST_NOSUID: + in-latest-posix: '' From 5b9e6c7993359c16b4d645c851bb7fe2fd7b78c7 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Mon, 10 Feb 2025 18:03:44 +0100 Subject: [PATCH 177/293] [libc++] Improves type-safety in generator script. (#101880) This changes the code to use dataclasses instead of dict entries. It also adds type aliases to use in the typing information and updates the typing information. --- .../feature_test_macro/ftm_metadata.sh.py | 53 +++--- .../version_header_implementation.sh.py | 158 +++++++++--------- .../generate_feature_test_macro_components.py | 107 ++++++------ 3 files changed, 160 insertions(+), 158 deletions(-) diff --git a/libcxx/test/libcxx/feature_test_macro/ftm_metadata.sh.py b/libcxx/test/libcxx/feature_test_macro/ftm_metadata.sh.py index 541447a5916c0..4f23773f9a0a5 100644 --- a/libcxx/test/libcxx/feature_test_macro/ftm_metadata.sh.py +++ b/libcxx/test/libcxx/feature_test_macro/ftm_metadata.sh.py @@ -11,7 +11,7 @@ import sys sys.path.append(sys.argv[1]) -from generate_feature_test_macro_components import FeatureTestMacros +from generate_feature_test_macro_components import FeatureTestMacros, Metadata def test(output, expected): @@ -19,38 +19,29 @@ def test(output, expected): ftm = FeatureTestMacros(sys.argv[2]) + test( ftm.ftm_metadata, { - "__cpp_lib_any": { - "headers": ["any"], - "test_suite_guard": None, - "libcxx_guard": None, - }, - "__cpp_lib_barrier": { - "headers": ["barrier"], - "test_suite_guard": "!defined(_LIBCPP_VERSION) || (_LIBCPP_HAS_THREADS && _LIBCPP_AVAILABILITY_HAS_SYNC)", - "libcxx_guard": "_LIBCPP_HAS_THREADS && _LIBCPP_AVAILABILITY_HAS_SYNC", - }, - "__cpp_lib_format": { - "headers": ["format"], - "test_suite_guard": None, - "libcxx_guard": None, - }, - "__cpp_lib_parallel_algorithm": { - "headers": ["algorithm", "numeric"], - "test_suite_guard": None, - "libcxx_guard": None, - }, - 
"__cpp_lib_variant": { - "headers": ["variant"], - "test_suite_guard": None, - "libcxx_guard": None, - }, - "__cpp_lib_missing_FTM_in_older_standard": { - "headers": [], - "test_suite_guard": None, - "libcxx_guard": None, - }, + "__cpp_lib_any": Metadata( + headers=["any"], test_suite_guard=None, libcxx_guard=None + ), + "__cpp_lib_barrier": Metadata( + headers=["barrier"], + test_suite_guard="!defined(_LIBCPP_VERSION) || (_LIBCPP_HAS_THREADS && _LIBCPP_AVAILABILITY_HAS_SYNC)", + libcxx_guard="_LIBCPP_HAS_THREADS && _LIBCPP_AVAILABILITY_HAS_SYNC", + ), + "__cpp_lib_format": Metadata( + headers=["format"], test_suite_guard=None, libcxx_guard=None + ), + "__cpp_lib_parallel_algorithm": Metadata( + headers=["algorithm", "numeric"], test_suite_guard=None, libcxx_guard=None + ), + "__cpp_lib_variant": Metadata( + headers=["variant"], test_suite_guard=None, libcxx_guard=None + ), + "__cpp_lib_missing_FTM_in_older_standard": Metadata( + headers=[], test_suite_guard=None, libcxx_guard=None + ), }, ) diff --git a/libcxx/test/libcxx/feature_test_macro/version_header_implementation.sh.py b/libcxx/test/libcxx/feature_test_macro/version_header_implementation.sh.py index b9e087c210767..2771a2f7d8abf 100644 --- a/libcxx/test/libcxx/feature_test_macro/version_header_implementation.sh.py +++ b/libcxx/test/libcxx/feature_test_macro/version_header_implementation.sh.py @@ -16,7 +16,7 @@ del sys.argv[1:3] sys.path.append(UTILS) -from generate_feature_test_macro_components import FeatureTestMacros +from generate_feature_test_macro_components import FeatureTestMacros, VersionHeader class Test(unittest.TestCase): def setUp(self): @@ -27,114 +27,114 @@ def test_implementation(self): expected = { "17": [ { - "__cpp_lib_any": { - "value": "201606L", - "implemented": True, - "need_undef": False, - "condition": None, - }, + "__cpp_lib_any": VersionHeader( + value="201606L", + implemented=True, + need_undef=False, + condition=None, + ), }, { - "__cpp_lib_parallel_algorithm": { - "value": 
"201603L", - "implemented": True, - "need_undef": False, - "condition": None, - }, + "__cpp_lib_parallel_algorithm": VersionHeader( + value="201603L", + implemented=True, + need_undef=False, + condition=None, + ), }, { - "__cpp_lib_variant": { - "value": "202102L", - "implemented": True, - "need_undef": False, - "condition": None, - }, + "__cpp_lib_variant": VersionHeader( + value="202102L", + implemented=True, + need_undef=False, + condition=None, + ), }, { - "__cpp_lib_missing_FTM_in_older_standard": { - "value": "2017L", - "implemented": False, - "need_undef": False, - "condition": None, - }, + "__cpp_lib_missing_FTM_in_older_standard" : VersionHeader( + value = "2017L", + implemented = False, + need_undef = False, + condition = None, + ), }, ], "20": [ { - "__cpp_lib_barrier": { - "value": "201907L", - "implemented": True, - "need_undef": False, - "condition": "_LIBCPP_HAS_THREADS && _LIBCPP_AVAILABILITY_HAS_SYNC", - }, + "__cpp_lib_barrier": VersionHeader( + value="201907L", + implemented=True, + need_undef=False, + condition="_LIBCPP_HAS_THREADS && _LIBCPP_AVAILABILITY_HAS_SYNC", + ), }, { - "__cpp_lib_format": { - "value": "202110L", - "implemented": False, - "need_undef": False, - "condition": None, - }, + "__cpp_lib_format": VersionHeader( + value="202110L", + implemented=False, + need_undef=False, + condition=None, + ), }, { - "__cpp_lib_variant": { - "value": "202106L", - "implemented": False, - "need_undef": False, - "condition": None, - }, + "__cpp_lib_variant": VersionHeader( + value="202106L", + implemented=False, + need_undef=False, + condition=None, + ), }, { - "__cpp_lib_missing_FTM_in_older_standard": { - "value": "2020L", - "implemented": False, - "need_undef": False, - "condition": None, - }, + "__cpp_lib_missing_FTM_in_older_standard" : VersionHeader( + value = "2020L", + implemented = False, + need_undef = False, + condition = None, + ), }, ], "23": [ { - "__cpp_lib_format": { - "value": "202207L", - "implemented": False, - "need_undef": 
False, - "condition": None, - }, + "__cpp_lib_format": VersionHeader( + value="202207L", + implemented=False, + need_undef=False, + condition=None, + ), }, ], "26": [ { - "__cpp_lib_barrier": { - "value": "299900L", - "implemented": True, - "need_undef": True, - "condition": "_LIBCPP_HAS_THREADS && _LIBCPP_AVAILABILITY_HAS_SYNC", - }, + "__cpp_lib_barrier": VersionHeader( + value="299900L", + implemented=True, + need_undef=True, + condition="_LIBCPP_HAS_THREADS && _LIBCPP_AVAILABILITY_HAS_SYNC", + ), }, { - "__cpp_lib_format": { - "value": "202311L", - "implemented": False, - "need_undef": False, - "condition": None, - }, + "__cpp_lib_format": VersionHeader( + value="202311L", + implemented=False, + need_undef=False, + condition=None, + ), }, { - "__cpp_lib_variant": { - "value": "202306L", - "implemented": False, - "need_undef": False, - "condition": None, - }, + "__cpp_lib_variant": VersionHeader( + value="202306L", + implemented=False, + need_undef=False, + condition=None, + ), }, { - "__cpp_lib_missing_FTM_in_older_standard": { - "value": "2026L", - "implemented": False, - "need_undef": False, - "condition": None, - }, + "__cpp_lib_missing_FTM_in_older_standard" : VersionHeader( + value = "2026L", + implemented = False, + need_undef = False, + condition = None, + ), }, ], } diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index 58ecd79cf7469..e9e531733abb5 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -2,8 +2,15 @@ import os from builtins import range +from dataclasses import dataclass from functools import reduce -from typing import Any, Dict, List # Needed for python 3.8 compatibility. +from typing import ( + Any, + Dict, + List, # Needed for python 3.8 compatibility. 
+ NewType, + Optional, +) import functools import json @@ -1944,9 +1951,28 @@ def produce_docs(): f.write(doc_str) +Std = NewType("Std", str) # Standard version number +Ftm = NewType("Ftm", str) # The name of a feature test macro +Value = NewType("Value", str) # The value of a feature test macro including the L suffix + +@dataclass +class Metadata: + headers: list[str] = None + test_suite_guard: str = None + libcxx_guard: str = None + + +@dataclass +class VersionHeader: + value: Value = None + implemented: bool = None + need_undef: bool = None + condition: str = None + + def get_ftms( - data, std_dialects: List[str], use_implemented_status: bool -) -> Dict[str, Dict[str, Any]]: + data, std_dialects: List[Std], use_implemented_status: bool +) -> Dict[Ftm, Dict[Std, Optional[Value]]]: """Impementation for FeatureTestMacros.(standard|implemented)_ftms().""" result = dict() for feature in data: @@ -1983,7 +2009,7 @@ def get_ftms( return result -def generate_version_header_dialect_block(data: Dict[str, Any]) -> str: +def generate_version_header_dialect_block(data: Dict[Ftm, VersionHeader]) -> str: """Generates the contents of the version header for a dialect. This generates the contents of a @@ -1994,27 +2020,29 @@ def generate_version_header_dialect_block(data: Dict[str, Any]) -> str: result = "" for element in data: for ftm, entry in element.items(): - if not entry["implemented"]: + if not entry.implemented: # When a FTM is not implemented don't add the guards # or undefine the (possibly) defined macro. 
- result += f'// define {ftm} {entry["value"]}\n' + result += f"// define {ftm} {entry.value}\n" else: - need_undef = entry["need_undef"] - if entry["condition"]: - result += f'# if {entry["condition"]}\n' - if entry["need_undef"]: + need_undef = entry.need_undef + if entry.condition: + result += f"# if {entry.condition}\n" + if entry.need_undef: result += f"# undef {ftm}\n" - result += f'# define {ftm} {entry["value"]}\n' + result += f"# define {ftm} {entry.value}\n" result += f"# endif\n" else: - if entry["need_undef"]: + if entry.need_undef: result += f"# undef {ftm}\n" - result += f'# define {ftm} {entry["value"]}\n' + result += f"# define {ftm} {entry.value}\n" return result -def generate_version_header_implementation(data: Dict[str, Dict[str, Any]]) -> str: +def generate_version_header_implementation( + data: Dict[Std, Dict[Ftm, VersionHeader]] +) -> str: """Generates the body of the version header.""" template = """#if _LIBCPP_STD_VER >= {dialect} @@ -2132,7 +2160,7 @@ def __init__(self, filename: str): self.__data = json.load(f) @functools.cached_property - def std_dialects(self) -> List[str]: + def std_dialects(self) -> List[Std]: """Returns the C++ dialects avaiable. The available dialects are based on the 'c++xy' keys found the 'values' @@ -2151,63 +2179,44 @@ def std_dialects(self) -> List[str]: return sorted(list(dialects)) @functools.cached_property - def standard_ftms(self) -> Dict[str, Dict[str, Any]]: + def standard_ftms(self) -> Dict[Ftm, Dict[Std, Optional[Value]]]: """Returns the FTM versions per dialect in the Standard. This function does not use the 'implemented' flag. The output contains the versions used in the Standard. When a FTM in libc++ is not implemented according to the Standard to output may opt to show the expected value. - - The result is a dict with the following content - - key: Name of the feature test macro. - - value: A dict with the following content: - * key: The version of the C++ dialect. 
- * value: The value of the feature-test macro. """ return get_ftms(self.__data, self.std_dialects, False) @functools.cached_property - def implemented_ftms(self) -> Dict[str, Dict[str, Any]]: + def implemented_ftms(self) -> Dict[Ftm, Dict[Std, Optional[Value]]]: """Returns the FTM versions per dialect implemented in libc++. Unlike `get_std_dialect_versions` this function uses the 'implemented' flag. This returns the actual implementation status in libc++. - - The result is a dict with the following content - - key: Name of the feature test macro. - - value: A dict with the following content: - * key: The version of the C++ dialect. - * value: The value of the feature-test macro. When a feature-test - macro is not implemented its value is None. """ return get_ftms(self.__data, self.std_dialects, True) @functools.cached_property - def ftm_metadata(self) -> Dict[str, Dict[str, Any]]: + def ftm_metadata(self) -> Dict[Ftm, Metadata]: """Returns the metadata of the FTMs defined in the Standard. The metadata does not depend on the C++ dialect used. - The result is a dict with the following contents: - - key: Name of the feature test macro. - - value: A dict with the following content: - * headers: The list of headers that should provide the FTM - * test_suite_guard: The condition for testing the FTM in the test suite. - * test_suite_guard: The condition for testing the FTM in the version header. 
""" result = dict() for feature in self.__data: - entry = dict() - entry["headers"] = feature["headers"] - entry["test_suite_guard"] = feature.get("test_suite_guard", None) - entry["libcxx_guard"] = feature.get("libcxx_guard", None) - result[feature["name"]] = entry + result[feature["name"]] = Metadata( + feature["headers"], + feature.get("test_suite_guard", None), + feature.get("libcxx_guard", None), + ) return result @property - def version_header_implementation(self) -> Dict[str, List[Dict[str, Any]]]: + def version_header_implementation(self) -> Dict[Std, Dict[Ftm, VersionHeader]]: """Generates the body of the version header.""" result = dict() for std in self.std_dialects: @@ -2223,11 +2232,13 @@ def version_header_implementation(self) -> Dict[str, List[Dict[str, Any]]]: continue last_value = value - entry = dict() - entry["value"] = value - entry["implemented"] = self.implemented_ftms[ftm][std] == self.standard_ftms[ftm][std] - entry["need_undef"] = last_entry is not None and last_entry["implemented"] and entry["implemented"] - entry["condition"] = self.ftm_metadata[ftm]["libcxx_guard"] + implemented = self.implemented_ftms[ftm][std] == self.standard_ftms[ftm][std] + entry = VersionHeader( + value, + implemented, + last_entry is not None and last_entry.implemented and implemented, + self.ftm_metadata[ftm].libcxx_guard, + ) last_entry = entry result[get_std_number(std)].append(dict({ftm: entry})) From 62245aaa6b1983ceae768eaee30aa41c4dd6db51 Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Mon, 10 Feb 2025 09:07:51 -0800 Subject: [PATCH 178/293] [RISCV] Improve Errors for GPRNoX0X2/SP Reg Classes (#126394) More adoption of better diagnostics for RISC-V register classes: - GPRNoX0X2 (GPRs excluding `zero` and `x2`, used for `c.lui`) - SP (only contains `sp`) --- llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 10 ++++++++-- llvm/test/MC/RISCV/rv32c-invalid.s | 6 +++--- llvm/test/MC/RISCV/xwchc-invalid.s | 8 ++++---- 3 files changed, 15 insertions(+), 9 deletions(-) 
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index e7e7a4b7d035b..6f2d0cf40352f 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -260,7 +260,10 @@ def GPRX5 : GPRRegisterClass<(add X5)> { def GPRNoX0 : GPRRegisterClass<(sub GPR, X0)>; -def GPRNoX0X2 : GPRRegisterClass<(sub GPR, X0, X2)>; +def GPRNoX0X2 : GPRRegisterClass<(sub GPR, X0, X2)> { + let DiagnosticType = "InvalidRegClassGPRNoX0X2"; + let DiagnosticString = "register must be a GPR excluding zero (x0) and sp (x2)"; +} def GPRX7 : GPRRegisterClass<(add X7)>; @@ -284,7 +287,10 @@ def GPRTC : GPRRegisterClass<(add (sequence "X%u", 6, 7), (sequence "X%u", 28, 31))>; def GPRTCNonX7 : GPRRegisterClass<(sub GPRTC, X7)>; -def SP : GPRRegisterClass<(add X2)>; +def SP : GPRRegisterClass<(add X2)> { + let DiagnosticType = "InvalidRegClassSP"; + let DiagnosticString = "register must be sp (x2)"; +} // Saved Registers from s0 to s7, for C.MVA01S07 instruction in Zcmp extension def SR07 : GPRRegisterClass<(add (sequence "X%u", 8, 9), diff --git a/llvm/test/MC/RISCV/rv32c-invalid.s b/llvm/test/MC/RISCV/rv32c-invalid.s index 9fbd5e9f117f9..d618a13ca4047 100644 --- a/llvm/test/MC/RISCV/rv32c-invalid.s +++ b/llvm/test/MC/RISCV/rv32c-invalid.s @@ -33,11 +33,11 @@ c.add zero, zero, sp # CHECK: :[[@LINE]]:14: error: invalid operand for instruc ## GPRNoX0X2 c.lui x0, 4 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RVC Hint Instructions{{$}} -c.lui x2, 4 # CHECK: :[[@LINE]]:7: error: invalid operand for instruction +c.lui x2, 4 # CHECK: :[[@LINE]]:7: error: register must be a GPR excluding zero (x0) and sp (x2){{$}} ## SP -c.addi4spn a0, a0, 12 # CHECK: :[[@LINE]]:17: error: invalid operand for instruction -c.addi16sp t0, 16 # CHECK: :[[@LINE]]:13: error: invalid operand for instruction +c.addi4spn a0, a0, 12 # CHECK: :[[@LINE]]:17: error: register must be sp (x2) +c.addi16sp t0, 16 # CHECK: 
:[[@LINE]]:13: error: register must be sp (x2) # Out of range immediates diff --git a/llvm/test/MC/RISCV/xwchc-invalid.s b/llvm/test/MC/RISCV/xwchc-invalid.s index 99cc519573895..a399e1cb66271 100644 --- a/llvm/test/MC/RISCV/xwchc-invalid.s +++ b/llvm/test/MC/RISCV/xwchc-invalid.s @@ -8,13 +8,13 @@ qk.c.sh x8, 1(x8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate must be a mul qk.c.lhu x8, 64(x8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate must be a multiple of 2 bytes in the range [0, 62] qk.c.sh x8, 64(x8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate must be a multiple of 2 bytes in the range [0, 62] -qk.c.lbusp x8, 0(x8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction -qk.c.sbsp x8, 0(x8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction +qk.c.lbusp x8, 0(x8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: register must be sp (x2) +qk.c.sbsp x8, 0(x8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: register must be sp (x2) qk.c.lbusp x8, 32(sp) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate must be an integer in the range [0, 15] qk.c.sbsp x8, 32(sp) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate must be an integer in the range [0, 15] -qk.c.lhusp x8, 0(x8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction -qk.c.shsp x8, 0(x8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction +qk.c.lhusp x8, 0(x8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: register must be sp (x2) +qk.c.shsp x8, 0(x8) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: register must be sp (x2) qk.c.lhusp x8, 1(sp) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate must be a multiple of 2 bytes in the range [0, 30] qk.c.shsp x8, 1(sp) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate must be a multiple of 2 bytes in the range [0, 30] qk.c.lhusp x8, 32(sp) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate must be a multiple of 2 bytes in the range [0, 30] From b319dfef21f6c7b0bc6a356d6b9f41a3b3b98ae9 Mon Sep 17 00:00:00 2001 From: Mark 
de Wever Date: Mon, 10 Feb 2025 18:08:24 +0100 Subject: [PATCH 179/293] [libc++][CI] Updates Clang HEAD version in Docker. (#126419) This is a preparation to test Clang 21 in the CI, Drive-by: Updated some outdated documentation. --- libcxx/utils/ci/Dockerfile | 9 ++++----- libcxx/utils/ci/docker-compose.yml | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/libcxx/utils/ci/Dockerfile b/libcxx/utils/ci/Dockerfile index d0144908f098b..bd41f288686a4 100644 --- a/libcxx/utils/ci/Dockerfile +++ b/libcxx/utils/ci/Dockerfile @@ -7,18 +7,17 @@ # ===----------------------------------------------------------------------===## # # This file defines the buildkite and github actions builder images. -# You can build both images using: +# This images are tagged with . You can build both images using: # -# docker compose build +# TAG= docker compose build # # Or you can select a single image to build # -# docker compose build buildkite-builder +# TAG=test docker compose build actions-builder # # The final images can be found at # -# ghcr.io/libcxx/buildkite-builder -# ghcr.io/libcxx/actions-builder +# ghcr.io/libcxx/libcxx-linux-builder # ghcr.io/libcxx/android-buildkite-builder # # Members of the github.com/libcxx/ organizations can push new images to the CI. diff --git a/libcxx/utils/ci/docker-compose.yml b/libcxx/utils/ci/docker-compose.yml index 1e5d35a7fecdb..16db1b0e3acb3 100644 --- a/libcxx/utils/ci/docker-compose.yml +++ b/libcxx/utils/ci/docker-compose.yml @@ -1,6 +1,6 @@ x-versions: &compiler_versions GCC_LATEST_VERSION: 14 - LLVM_HEAD_VERSION: 20 + LLVM_HEAD_VERSION: 21 services: actions-builder: From 55015e150b35f69431ce1f906e22a598d5b2f000 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Mon, 10 Feb 2025 09:08:49 -0800 Subject: [PATCH 180/293] [NFC][TableGen] Delete `getLogicalOperandType` from InstrInfoEmitter (#125951) Delete `getLogicalOperandType` function from InstrInfoEmitter as no backend seems to use it. 
--- llvm/utils/TableGen/InstrInfoEmitter.cpp | 93 ------------------------ 1 file changed, 93 deletions(-) diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index 14ef4e725493b..9533891878962 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -94,9 +94,6 @@ class InstrInfoEmitter { void emitLogicalOperandSizeMappings( raw_ostream &OS, StringRef Namespace, ArrayRef NumberedInstructions); - void emitLogicalOperandTypeMappings( - raw_ostream &OS, StringRef Namespace, - ArrayRef NumberedInstructions); // Operand information. unsigned CollectOperandInfo(OperandInfoListTy &OperandInfoList, @@ -563,93 +560,6 @@ void InstrInfoEmitter::emitLogicalOperandSizeMappings( OS << "#endif // GET_INSTRINFO_LOGICAL_OPERAND_SIZE_MAP\n\n"; } -void InstrInfoEmitter::emitLogicalOperandTypeMappings( - raw_ostream &OS, StringRef Namespace, - ArrayRef NumberedInstructions) { - std::map, unsigned> LogicalOpTypeMap; - - std::map> InstMap; - - size_t OpTypeListSize = 0U; - std::vector LogicalOpTypeList; - for (const auto *Inst : NumberedInstructions) { - if (!Inst->TheDef->getValueAsBit("UseLogicalOperandMappings")) - continue; - - LogicalOpTypeList.clear(); - for (const auto &Op : Inst->Operands) { - auto *OpR = Op.Rec; - if ((OpR->isSubClassOf("Operand") || - OpR->isSubClassOf("RegisterOperand") || - OpR->isSubClassOf("RegisterClass")) && - !OpR->isAnonymous()) { - LogicalOpTypeList.push_back( - (Namespace + "::OpTypes::" + Op.Rec->getName()).str()); - } else { - LogicalOpTypeList.push_back("-1"); - } - } - OpTypeListSize = std::max(LogicalOpTypeList.size(), OpTypeListSize); - - auto I = - LogicalOpTypeMap.insert({LogicalOpTypeList, LogicalOpTypeMap.size()}) - .first; - InstMap[I->second].push_back( - (Namespace + "::" + Inst->TheDef->getName()).str()); - } - - OS << "#ifdef GET_INSTRINFO_LOGICAL_OPERAND_TYPE_MAP\n"; - OS << "#undef GET_INSTRINFO_LOGICAL_OPERAND_TYPE_MAP\n"; - OS << "namespace 
llvm::" << Namespace << " {\n"; - OS << "LLVM_READONLY static int\n"; - OS << "getLogicalOperandType(uint16_t Opcode, uint16_t LogicalOpIdx) {\n"; - if (!InstMap.empty()) { - std::vector *> LogicalOpTypeList( - LogicalOpTypeMap.size()); - for (auto &P : LogicalOpTypeMap) { - LogicalOpTypeList[P.second] = &P.first; - } - OS << " static const int TypeMap[][" << OpTypeListSize << "] = {\n"; - for (int r = 0, rs = LogicalOpTypeList.size(); r < rs; ++r) { - const auto &Row = *LogicalOpTypeList[r]; - OS << " {"; - int i, s = Row.size(); - for (i = 0; i < s; ++i) { - if (i > 0) - OS << ", "; - OS << Row[i]; - } - for (; i < static_cast(OpTypeListSize); ++i) { - if (i > 0) - OS << ", "; - OS << "-1"; - } - OS << "}"; - if (r != rs - 1) - OS << ","; - OS << "\n"; - } - OS << " };\n"; - - OS << " switch (Opcode) {\n"; - OS << " default: return -1;\n"; - for (auto &P : InstMap) { - auto OpMapIdx = P.first; - const auto &Insts = P.second; - for (const auto &Inst : Insts) { - OS << " case " << Inst << ":\n"; - } - OS << " return TypeMap[" << OpMapIdx << "][LogicalOpIdx];\n"; - } - OS << " }\n"; - } else { - OS << " return -1;\n"; - } - OS << "}\n"; - OS << "} // end namespace llvm::" << Namespace << "\n"; - OS << "#endif // GET_INSTRINFO_LOGICAL_OPERAND_TYPE_MAP\n\n"; -} - void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS, StringRef TargetName) { ArrayRef TIIPredicates = @@ -1155,9 +1065,6 @@ void InstrInfoEmitter::run(raw_ostream &OS) { emitLogicalOperandSizeMappings(OS, TargetName, NumberedInstructions); } - Timer.startTimer("Emit logical operand type mappings"); - emitLogicalOperandTypeMappings(OS, TargetName, NumberedInstructions); - Timer.startTimer("Emit helper methods"); emitMCIIHelperMethods(OS, TargetName); From e42fdcb41fdcfe7bf302b40f20afb4e9cda5602d Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 11 Feb 2025 01:11:44 +0800 Subject: [PATCH 181/293] [RISCV] Match widening fp instructions with same fpext used in multiple operands (#125803) Because the 
fpext has a single use constraint on it we can't match cases where it's used for both operands. Introduce a new PatFrag that allows multiple uses on a single user and use it for the binary patterns, and some ternary patterns. (For some of the ternary patterns there is a fneg that counts as a separate user, we still need to handle these) --- .../Target/RISCV/RISCVInstrInfoVSDPatterns.td | 26 +++++++++---------- .../Target/RISCV/RISCVInstrInfoVVLPatterns.td | 5 ++++ llvm/test/CodeGen/RISCV/rvv/vfwadd-sdnode.ll | 12 +++++++++ llvm/test/CodeGen/RISCV/rvv/vfwmacc-sdnode.ll | 25 ++++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vfwmul-sdnode.ll | 12 +++++++++ llvm/test/CodeGen/RISCV/rvv/vfwsub-sdnode.ll | 12 +++++++++ 6 files changed, 79 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index 8f77b2ce34d1f..c588e047c2ac8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -536,19 +536,19 @@ multiclass VPatWidenBinaryFPSDNode_VV_VF { defvar wti = vtiToWti.Wti; let Predicates = !listconcat(GetVTypePredicates.Predicates, GetVTypePredicates.Predicates) in { - def : Pat<(op (wti.Vector (riscv_fpextend_vl_oneuse + def : Pat<(op (wti.Vector (riscv_fpextend_vl_sameuser (vti.Vector vti.RegClass:$rs2), (vti.Mask true_mask), (XLenVT srcvalue))), - (wti.Vector (riscv_fpextend_vl_oneuse + (wti.Vector (riscv_fpextend_vl_sameuser (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), (XLenVT srcvalue)))), (!cast(instruction_name#"_VV_"#vti.LMul.MX) (wti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs2, vti.RegClass:$rs1, vti.AVL, vti.Log2SEW, TA_MA)>; - def : Pat<(op (wti.Vector (riscv_fpextend_vl_oneuse + def : Pat<(op (wti.Vector (riscv_fpextend_vl_sameuser (vti.Vector vti.RegClass:$rs2), (vti.Mask true_mask), (XLenVT srcvalue))), - (wti.Vector (riscv_fpextend_vl_oneuse + (wti.Vector (riscv_fpextend_vl_sameuser (vti.Vector 
(SplatFPOp vti.ScalarRegClass:$rs1)), (vti.Mask true_mask), (XLenVT srcvalue)))), (!cast(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX) @@ -571,10 +571,10 @@ multiclass VPatWidenBinaryFPSDNode_VV_VF_RM defvar wti = vtiToWti.Wti; let Predicates = !listconcat(GetVTypePredicates.Predicates, GetVTypePredicates.Predicates) in { - def : Pat<(op (wti.Vector (riscv_fpextend_vl_oneuse + def : Pat<(op (wti.Vector (riscv_fpextend_vl_sameuser (vti.Vector vti.RegClass:$rs2), (vti.Mask true_mask), (XLenVT srcvalue))), - (wti.Vector (riscv_fpextend_vl_oneuse + (wti.Vector (riscv_fpextend_vl_sameuser (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), (XLenVT srcvalue)))), (!cast(instruction_name#"_VV_"#vti.LMul.MX#"_E"#vti.SEW) @@ -584,10 +584,10 @@ multiclass VPatWidenBinaryFPSDNode_VV_VF_RM // RISCVInsertReadWriteCSR FRM_DYN, vti.AVL, vti.Log2SEW, TA_MA)>; - def : Pat<(op (wti.Vector (riscv_fpextend_vl_oneuse + def : Pat<(op (wti.Vector (riscv_fpextend_vl_sameuser (vti.Vector vti.RegClass:$rs2), (vti.Mask true_mask), (XLenVT srcvalue))), - (wti.Vector (riscv_fpextend_vl_oneuse + (wti.Vector (riscv_fpextend_vl_sameuser (vti.Vector (SplatFPOp (vti.Scalar vti.ScalarRegClass:$rs1))), (vti.Mask true_mask), (XLenVT srcvalue)))), (!cast(instruction_name#"_V"#vti.ScalarSuffix#"_"#vti.LMul.MX#"_E"#vti.SEW) @@ -669,10 +669,10 @@ multiclass VPatWidenFPMulAccSDNode_VV_VF_RM { defvar suffix = vti.LMul.MX # "_E" # vti.SEW; let Predicates = !listconcat(GetVTypePredicates.Predicates, GetVTypePredicates.Predicates) in { - def : Pat<(fma (wti.Vector (riscv_fpextend_vl_oneuse + def : Pat<(fma (wti.Vector (riscv_fpextend_vl_sameuser (vti.Vector vti.RegClass:$rs1), (vti.Mask true_mask), (XLenVT srcvalue))), - (riscv_fpextend_vl_oneuse (vti.Vector vti.RegClass:$rs2), - (vti.Mask true_mask), (XLenVT srcvalue)), + (riscv_fpextend_vl_sameuser (vti.Vector vti.RegClass:$rs2), + (vti.Mask true_mask), (XLenVT srcvalue)), (fneg wti.RegClass:$rd)), (!cast(instruction_name#"_VV_"#suffix) 
wti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 333ae52534681..f63c1560f6253 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -554,6 +554,11 @@ def riscv_fpextend_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C), return N->hasOneUse(); }]>; +def riscv_fpextend_vl_sameuser : PatFrag<(ops node:$A, node:$B, node:$C), + (riscv_fpextend_vl node:$A, node:$B, node:$C), [{ + return !N->use_empty() && all_equal(N->users()); +}]>; + def riscv_vfmadd_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, node:$E), (riscv_vfmadd_vl node:$A, node:$B, diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfwadd-sdnode.ll index 68014ff4206f8..f7d287a088cc3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwadd-sdnode.ll @@ -323,3 +323,15 @@ define @vfwadd_wf_nxv8f64_2( %va, flo %vd = fadd %va, %splat ret %vd } + +define @vfwadd_vv_nxv1f64_same_op( %va) { +; CHECK-LABEL: vfwadd_vv_nxv1f64_same_op: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwadd.vv v9, v8, v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %vb = fpext %va to + %vc = fadd %vb, %vb + ret %vc +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmacc-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmacc-sdnode.ll index f69b2346226ee..63113b8780989 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwmacc-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwmacc-sdnode.ll @@ -1764,3 +1764,28 @@ define @vfwnmsac_fv_nxv8f64( %va, @llvm.fma.v8f64( %vd, %vf, %va) ret %vg } + +define @vfwma_vv_nxv1f64_same_op( %va, %vb) { +; CHECK-LABEL: vfwma_vv_nxv1f64_same_op: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwmacc.vv v9, v8, v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + 
%vc = fpext %va to + %vd = call @llvm.fma( %vc, %vc, %vb) + ret %vd +} + +define @vfwmsac_vv_nxv1f64_same_op( %va, %vb) { +; CHECK-LABEL: vfwmsac_vv_nxv1f64_same_op: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwmsac.vv v9, v8, v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %vc = fpext %va to + %vd = fneg %vb + %ve = call @llvm.fma( %vc, %vc, %vd) + ret %ve +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmul-sdnode.ll index f00ff4b6d2cec..8cc8c5cffca6b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwmul-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwmul-sdnode.ll @@ -175,3 +175,15 @@ define @vfwmul_vf_nxv8f64_2( %va, floa %ve = fmul %vc, %splat ret %ve } + +define @vfwmul_vv_nxv1f64_same_op( %va) { +; CHECK-LABEL: vfwmul_vv_nxv1f64_same_op: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwmul.vv v9, v8, v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %vb = fpext %va to + %vc = fmul %vb, %vb + ret %vc +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfwsub-sdnode.ll index b9f66d5d30825..d0cb64d986661 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwsub-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwsub-sdnode.ll @@ -323,3 +323,15 @@ define @vfwsub_wf_nxv8f64_2( %va, flo %vd = fsub %va, %splat ret %vd } + +define @vfwsub_vv_nxv1f64_same_op( %va) { +; CHECK-LABEL: vfwsub_vv_nxv1f64_same_op: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwsub.vv v9, v8, v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %vb = fpext %va to + %vc = fsub %vb, %vb + ret %vc +} From 9e0077c921ebf505afa117cf520c628ace1a29cf Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Mon, 10 Feb 2025 18:14:09 +0100 Subject: [PATCH 182/293] [ValueTracking] Handle not in dominating condition. (#126423) General handling of not in dominating condition. 
proof: https://alive2.llvm.org/ce/z/FjJN8q --- llvm/lib/Analysis/ValueTracking.cpp | 11 +++++++++++ .../Transforms/InstCombine/fpclass-from-dom-cond.ll | 3 +-- llvm/test/Transforms/InstCombine/known-bits.ll | 3 +-- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 8a9ad55366ee7..fb744d61aad63 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -801,6 +801,9 @@ static void computeKnownBitsFromCond(const Value *V, Value *Cond, if (auto *Cmp = dyn_cast(Cond)) computeKnownBitsFromICmpCond(V, Cmp, Known, SQ, Invert); + + if (Depth < MaxAnalysisRecursionDepth && match(Cond, m_Not(m_Value(A)))) + computeKnownBitsFromCond(V, A, Known, Depth + 1, SQ, !Invert); } void llvm::computeKnownBitsFromContext(const Value *V, KnownBits &Known, @@ -4934,6 +4937,11 @@ static void computeKnownFPClassFromCond(const Value *V, Value *Cond, KnownFromContext); return; } + if (Depth < MaxAnalysisRecursionDepth && match(Cond, m_Not(m_Value(A)))) { + computeKnownFPClassFromCond(V, A, Depth + 1, !CondIsTrue, CxtI, + KnownFromContext); + return; + } CmpPredicate Pred; Value *LHS; uint64_t ClassVal = 0; @@ -10272,6 +10280,9 @@ void llvm::findValuesAffectedByCondition( m_Value()))) { // Handle patterns that computeKnownFPClass() support. 
AddAffected(A); + } else if (!IsAssume && match(V, m_Not(m_Value(X)))) { + // Assume is checked here to avoid issues with ephemeral values + Worklist.push_back(X); } } } diff --git a/llvm/test/Transforms/InstCombine/fpclass-from-dom-cond.ll b/llvm/test/Transforms/InstCombine/fpclass-from-dom-cond.ll index e6df7fab356b4..934852d1ca8bc 100644 --- a/llvm/test/Transforms/InstCombine/fpclass-from-dom-cond.ll +++ b/llvm/test/Transforms/InstCombine/fpclass-from-dom-cond.ll @@ -528,8 +528,7 @@ define i1 @test_inv_and(float %x, i1 %cond2) { ; CHECK-NEXT: [[AND:%.*]] = and i1 [[COND2]], [[NOT]] ; CHECK-NEXT: br i1 [[AND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[RET1:%.*]] = fcmp oeq float [[X]], 0x7FF0000000000000 -; CHECK-NEXT: ret i1 [[RET1]] +; CHECK-NEXT: ret i1 false ; CHECK: if.else: ; CHECK-NEXT: ret i1 false ; diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll index 7563a63f607f0..b729cbd971acc 100644 --- a/llvm/test/Transforms/InstCombine/known-bits.ll +++ b/llvm/test/Transforms/InstCombine/known-bits.ll @@ -2374,8 +2374,7 @@ define i8 @test_inv_cond_and(i8 %x, i1 %c) { ; CHECK-NEXT: [[COND:%.*]] = and i1 [[C:%.*]], [[NOT]] ; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]] ; CHECK: if: -; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4 -; CHECK-NEXT: ret i8 [[OR1]] +; CHECK-NEXT: ret i8 -4 ; CHECK: exit: ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4 ; CHECK-NEXT: ret i8 [[OR2]] From 0a470a926481d370251731cb2dd897531756335f Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 10 Feb 2025 09:21:31 -0800 Subject: [PATCH 183/293] [ELF] --package-metadata: support %[0-9a-fA-F][0-9a-fA-F] (This application-specific option is probably not appropriate as a linker option (.o file offers more flexibility and decouples JSON verification from linkers). However, the option has gained some traction in Linux distributions, with support in GNU ld, gold, and mold.) 
GNU ld has supported percent-encoded bytes and extensions like `%[comma]` since November 2024. mold supports just percent-encoded bytes. To prepare for potential adoption by Ubuntu, let's support percent-encoded bytes. Link: https://sourceware.org/bugzilla/show_bug.cgi?id=32003 Link: https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/2071468 Pull Request: https://github.com/llvm/llvm-project/pull/126396 --- lld/ELF/Config.h | 2 +- lld/ELF/Driver.cpp | 23 ++++++++++++++++++++++- lld/ELF/Options.td | 2 +- lld/docs/ld.lld.1 | 4 ++++ lld/test/ELF/package-metadata.s | 20 +++++++++++++++----- 5 files changed, 43 insertions(+), 8 deletions(-) diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 3cdb400e423fd..f132b11b20c63 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -411,7 +411,7 @@ struct Config { StringRef thinLTOJobs; unsigned timeTraceGranularity; int32_t splitStackAdjustSize; - StringRef packageMetadata; + SmallVector packageMetadata; // The following config options do not directly correspond to any // particular command line options. 
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 3d6e022a89e5f..7d14180a49926 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -816,6 +816,26 @@ static ICFLevel getICF(opt::InputArgList &args) { return ICFLevel::All; } +static void parsePackageMetadata(Ctx &ctx, const opt::Arg &arg) { + unsigned c0, c1; + SmallVector decoded; + StringRef s = arg.getValue(); + for (size_t i = 0, e = s.size(); i != e; ++i) { + if (s[i] != '%') { + decoded.push_back(s[i]); + } else if (i + 2 < e && (c1 = hexDigitValue(s[i + 1])) != -1u && + (c0 = hexDigitValue(s[i + 2])) != -1u) { + decoded.push_back(uint8_t(c1 * 16 + c0)); + i += 2; + } else { + ErrAlways(ctx) << arg.getSpelling() << ": invalid % escape at byte " << i + << "; supports only %[0-9a-fA-F][0-9a-fA-F]"; + return; + } + } + ctx.arg.packageMetadata = std::move(decoded); +} + static StripPolicy getStrip(Ctx &ctx, opt::InputArgList &args) { if (args.hasArg(OPT_relocatable)) return StripPolicy::None; @@ -1425,7 +1445,8 @@ static void readConfigs(Ctx &ctx, opt::InputArgList &args) { ctx.arg.optimize = args::getInteger(args, OPT_O, 1); ctx.arg.orphanHandling = getOrphanHandling(ctx, args); ctx.arg.outputFile = args.getLastArgValue(OPT_o); - ctx.arg.packageMetadata = args.getLastArgValue(OPT_package_metadata); + if (auto *arg = args.getLastArg(OPT_package_metadata)) + parsePackageMetadata(ctx, *arg); ctx.arg.pie = args.hasFlag(OPT_pie, OPT_no_pie, false); ctx.arg.printIcfSections = args.hasFlag(OPT_print_icf_sections, OPT_no_print_icf_sections, false); diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td index 80032490da0de..b3b12a0646875 100644 --- a/lld/ELF/Options.td +++ b/lld/ELF/Options.td @@ -578,7 +578,7 @@ def z: JoinedOrSeparate<["-"], "z">, MetaVarName<"