From 501c77da6024959438c92f33bd997fe6f39e6b6c Mon Sep 17 00:00:00 2001
From: Victor Campos
Date: Mon, 17 Feb 2025 10:10:35 +0000
Subject: [PATCH 001/127] [LLD][ELF][ARM] Fix resolution of R_ARM_THM_JUMP8 and R_ARM_THM_JUMP11 for big endian (#126933)

These relocations apply to 16-bit Thumb instructions, so reading 16 bits
rather than 32 bits ensures the correct bits are masked and written back,
keeping the relocation logic consistent with the instruction encoding.

Before this patch, 32 bits were read from the ELF object. This did not align
with the instruction size of 16 bits, but the masking incidentally made it
all work nonetheless. However, this was the case only in little endian. In
big endian mode, the read 32-bit word had to have its bytes reversed. With
this byte reordering, the masking would be applied to the wrong bits, hence
causing the incorrect encoding to be produced as a result of the relocation
resolution.

The added test checks the result for both little and big endian modes.
---
 lld/ELF/Arch/ARM.cpp              |  4 ++--
 lld/test/ELF/arm-thumb-jump8-11.s | 32 +++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 2 deletions(-)
 create mode 100644 lld/test/ELF/arm-thumb-jump8-11.s

diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp
index 7d2953ddf64f0..e667fdc0633c5 100644
--- a/lld/ELF/Arch/ARM.cpp
+++ b/lld/ELF/Arch/ARM.cpp
@@ -663,12 +663,12 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
   case R_ARM_THM_JUMP8:
     // We do a 9 bit check because val is right-shifted by 1 bit.
     checkInt(ctx, loc, val, 9, rel);
-    write16(ctx, loc, (read32(ctx, loc) & 0xff00) | ((val >> 1) & 0x00ff));
+    write16(ctx, loc, (read16(ctx, loc) & 0xff00) | ((val >> 1) & 0x00ff));
     break;
   case R_ARM_THM_JUMP11:
     // We do a 12 bit check because val is right-shifted by 1 bit.
     checkInt(ctx, loc, val, 12, rel);
-    write16(ctx, loc, (read32(ctx, loc) & 0xf800) | ((val >> 1) & 0x07ff));
+    write16(ctx, loc, (read16(ctx, loc) & 0xf800) | ((val >> 1) & 0x07ff));
     break;
   case R_ARM_THM_JUMP19:
     // Encoding T3: Val = S:J2:J1:imm6:imm11:0
diff --git a/lld/test/ELF/arm-thumb-jump8-11.s b/lld/test/ELF/arm-thumb-jump8-11.s
new file mode 100644
index 0000000000000..ed54f3c0cc945
--- /dev/null
+++ b/lld/test/ELF/arm-thumb-jump8-11.s
@@ -0,0 +1,32 @@
+# REQUIRES: arm
+
+# RUN: llvm-mc -triple thumbv6m-arm-eabi --filetype=obj %s -o %t.o
+# RUN: ld.lld %t.o -o %t
+# RUN: llvm-objdump -d %t --no-show-raw-insn | FileCheck %s --check-prefixes=CHECK,CHECK-LE
+
+# RUN: llvm-mc -triple thumbebv6m-arm-eabi --filetype=obj %s -o %t.o
+# RUN: ld.lld %t.o -o %t
+# RUN: llvm-objdump -d %t --no-show-raw-insn | FileCheck %s --check-prefixes=CHECK,CHECK-BE
+
+# CHECK-LE: file format elf32-littlearm
+# CHECK-BE: file format elf32-bigarm
+
+# CHECK: Disassembly of section .text:
+
+# CHECK-LABEL: [[#%x,TARGET:]] :
+# CHECK-NEXT: [[#TARGET]]: bx lr
+
+# CHECK-LABEL: <_start>:
+# CHECK-NEXT: b 0x[[#TARGET]]
+# CHECK-NEXT: beq 0x[[#TARGET]]
+
+ .thumb
+ .section .text.1, "ax", %progbits
+target:
+ bx lr
+
+ .section .text.2, "ax", %progbits
+ .globl _start
+_start:
+ b.n target // R_ARM_THM_JUMP11
+ beq.n target // R_ARM_THM_JUMP8

From f378e52ed3c6f8da4973f97f1ef043c2eb0da721 Mon Sep 17 00:00:00 2001
From: Balazs Benics
Date: Mon, 17 Feb 2025 11:12:55 +0100
Subject: [PATCH 002/127] [clang][analysis] Fix flaky clang/test/Analysis/live-stmts.cpp test (2nd attempt) (#127406)

My previous attempt (#126913) at fixing the flaky test was on the right
track when I used the begin locations as a stable ordering. However, I
forgot to consider the case where the begin locations are the same among
the Exprs.

In an `EXPENSIVE_CHECKS` build, arrays are randomly shuffled prior to
sorting them. This exposed the flaky behavior much more often, basically
breaking the "stability" of the vector - as it should. Because of this, I
had to revert the previous fix attempt in #127034.

To fix this, this time I use `Expr::getID` as a stable ID for an Expr.

Hopefully fixes #126619
Hopefully fixes #126804
---
 clang/lib/Analysis/LiveVariables.cpp |  8 ++---
 clang/test/Analysis/live-stmts.cpp   | 47 +++++++++++++---------------
 2 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/clang/lib/Analysis/LiveVariables.cpp b/clang/lib/Analysis/LiveVariables.cpp
index af563702b77bf..c7d3451d37cf6 100644
--- a/clang/lib/Analysis/LiveVariables.cpp
+++ b/clang/lib/Analysis/LiveVariables.cpp
@@ -664,18 +664,18 @@ void LiveVariables::dumpExprLiveness(const SourceManager &M) {
 }
 
 void LiveVariablesImpl::dumpExprLiveness(const SourceManager &M) {
-  auto ByBeginLoc = [&M](const Expr *L, const Expr *R) {
-    return M.isBeforeInTranslationUnit(L->getBeginLoc(), R->getBeginLoc());
+  const ASTContext &Ctx = analysisContext.getASTContext();
+  auto ByIDs = [&Ctx](const Expr *L, const Expr *R) {
+    return L->getID(Ctx) < R->getID(Ctx);
   };
 
   // Don't iterate over blockEndsToLiveness directly because it's not sorted.
for (const CFGBlock *B : *analysisContext.getCFG()) { - llvm::errs() << "\n[ B" << B->getBlockID() << " (live expressions at block exit) ]\n"; std::vector LiveExprs; llvm::append_range(LiveExprs, blocksEndToLiveness[B].liveExprs); - llvm::sort(LiveExprs, ByBeginLoc); + llvm::sort(LiveExprs, ByIDs); for (const Expr *E : LiveExprs) { llvm::errs() << "\n"; E->dump(); diff --git a/clang/test/Analysis/live-stmts.cpp b/clang/test/Analysis/live-stmts.cpp index 9cac815e65de1..ca2ff6da8b133 100644 --- a/clang/test/Analysis/live-stmts.cpp +++ b/clang/test/Analysis/live-stmts.cpp @@ -1,6 +1,3 @@ -// Disabling this flaky test, see https://github.com/llvm/llvm-project/pull/126913#issuecomment-2655850766 -// UNSUPPORTED: true - // RUN: %clang_analyze_cc1 -w -analyzer-checker=debug.DumpLiveExprs %s 2>&1\ // RUN: | FileCheck %s @@ -29,36 +26,36 @@ int testThatDumperWorks(int x, int y, int z) { // CHECK-EMPTY: // CHECK: [ B2 (live expressions at block exit) ] // CHECK-EMPTY: -// CHECK-NEXT: ImplicitCastExpr {{.*}} -// CHECK-NEXT: `-ImplicitCastExpr {{.*}} -// CHECK-NEXT: `-DeclRefExpr {{.*}} 'x' 'int' -// CHECK-EMPTY: // CHECK-NEXT: DeclRefExpr {{.*}} 'y' 'int' // CHECK-EMPTY: // CHECK-NEXT: DeclRefExpr {{.*}} 'z' 'int' // CHECK-EMPTY: -// CHECK-EMPTY: -// CHECK: [ B3 (live expressions at block exit) ] -// CHECK-EMPTY: // CHECK-NEXT: ImplicitCastExpr {{.*}} // CHECK-NEXT: `-ImplicitCastExpr {{.*}} // CHECK-NEXT: `-DeclRefExpr {{.*}} 'x' 'int' // CHECK-EMPTY: -// CHECK-NEXT: DeclRefExpr {{.*}} 'y' 'int' // CHECK-EMPTY: -// CHECK-NEXT: DeclRefExpr {{.*}} 'z' 'int' +// CHECK: [ B3 (live expressions at block exit) ] // CHECK-EMPTY: +// CHECK-NEXT: DeclRefExpr {{.*}} 'y' 'int' // CHECK-EMPTY: -// CHECK: [ B4 (live expressions at block exit) ] +// CHECK-NEXT: DeclRefExpr {{.*}} 'z' 'int' // CHECK-EMPTY: // CHECK-NEXT: ImplicitCastExpr {{.*}} // CHECK-NEXT: `-ImplicitCastExpr {{.*}} // CHECK-NEXT: `-DeclRefExpr {{.*}} 'x' 'int' // CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK: [ B4 (live expressions at block exit) ] +// CHECK-EMPTY: // CHECK-NEXT: DeclRefExpr {{.*}} 'y' 'int' // CHECK-EMPTY: // CHECK-NEXT: DeclRefExpr {{.*}} 'z' 'int' // CHECK-EMPTY: +// CHECK-NEXT: ImplicitCastExpr {{.*}} +// CHECK-NEXT: `-ImplicitCastExpr {{.*}} +// CHECK-NEXT: `-DeclRefExpr {{.*}} 'x' 'int' +// CHECK-EMPTY: // CHECK-EMPTY: // CHECK: [ B5 (live expressions at block exit) ] // CHECK-EMPTY: @@ -228,15 +225,15 @@ int logicalOpInTernary(bool b) { // CHECK: ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: // CHECK: BinaryOperator {{.*}} '_Bool' '||' // CHECK: |-ImplicitCastExpr {{.*}} '_Bool' // CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK: `-ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: -// CHECK: ImplicitCastExpr {{.*}} '_Bool' -// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' -// CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 0 // CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 1 @@ -247,15 +244,15 @@ int logicalOpInTernary(bool b) { // CHECK: ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: 
// CHECK: BinaryOperator {{.*}} '_Bool' '||' // CHECK: |-ImplicitCastExpr {{.*}} '_Bool' // CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK: `-ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: -// CHECK: ImplicitCastExpr {{.*}} '_Bool' -// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' -// CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 0 // CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 1 @@ -266,15 +263,15 @@ int logicalOpInTernary(bool b) { // CHECK: ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: // CHECK: BinaryOperator {{.*}} '_Bool' '||' // CHECK: |-ImplicitCastExpr {{.*}} '_Bool' // CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK: `-ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: -// CHECK: ImplicitCastExpr {{.*}} '_Bool' -// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' -// CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 0 // CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 1 @@ -285,15 +282,15 @@ int logicalOpInTernary(bool b) { // CHECK: ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: // CHECK: BinaryOperator {{.*}} '_Bool' '||' // CHECK: |-ImplicitCastExpr {{.*}} '_Bool' // CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK: `-ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: -// CHECK: ImplicitCastExpr {{.*}} '_Bool' -// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' -// CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 0 // CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 1 From b3028295e741159e5014c126cd74988785fe8bdb Mon Sep 17 00:00:00 2001 From: josel-amd Date: Mon, 17 Feb 2025 11:23:27 +0100 Subject: [PATCH 003/127] [mlir][linalg] Remove `computeStaticLoopSizes` (#124778) `computeStaticLoopSizes()` is functionally identical to `getStaticLoopRanges()`. Replace all uses of `computeStaticLoopSizes()` by `getStaticLoopRanges()` and remove the former. --- .../mlir/Dialect/Linalg/IR/LinalgInterfaces.td | 5 ----- mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp | 13 ------------- mlir/lib/Dialect/Linalg/Transforms/ConstantFold.cpp | 2 +- 3 files changed, 1 insertion(+), 19 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td index 98a5fd278a997..dbc1ac60e0973 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td @@ -863,11 +863,6 @@ def LinalgStructuredInterface /// `createFlatListOfOperandDims`. SmallVector createLoopRanges(OpBuilder &b, Location loc); - /// Compute the static loop sizes necessary to vectorize the computation. - /// This is done by applying `getShapesToLoopsMap` to - /// `createFlatListOfOperandStaticDims`. 
- SmallVector computeStaticLoopSizes(); - /// Returns the value that expresses the shape of the output in terms of /// shape of the input operands where possible LogicalResult reifyResultShapes(OpBuilder &b, diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp index 4185fcce393d5..466a9799295f9 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp @@ -1094,19 +1094,6 @@ SmallVector LinalgOp::createLoopRanges(OpBuilder &b, Location loc) { return res; } -SmallVector LinalgOp::computeStaticLoopSizes() { - AffineMap map = getLoopsToShapesMap(); - unsigned numDims = map.getNumDims(), numRes = map.getNumResults(); - SmallVector allShapeSizes = createFlatListOfOperandStaticDims(); - SmallVector res(numDims, 0); - for (unsigned idx = 0; idx < numRes; ++idx) { - auto result = map.getResult(idx); - if (auto d = dyn_cast(result)) - res[d.getPosition()] = allShapeSizes[idx]; - } - return res; -} - /// Visitor to check if any of the given set of positions from AffineDimExprs /// are used within an AffineExpr. struct HasAffineDimExprVisitor diff --git a/mlir/lib/Dialect/Linalg/Transforms/ConstantFold.cpp b/mlir/lib/Dialect/Linalg/Transforms/ConstantFold.cpp index 2e6079e1402e1..b53180b5cf7c3 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ConstantFold.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ConstantFold.cpp @@ -130,7 +130,7 @@ class FoldConstantBase : public OpInterfaceRewritePattern { return failure(); } - SmallVector loopBounds = linalgOp.computeStaticLoopSizes(); + SmallVector loopBounds = linalgOp.getStaticLoopRanges(); int64_t numElements = outputType.getNumElements(); // Use APInt/APFloat instead of Attribute here for constructing the output. From f09fd94d6b40a80e18093fdfc7d9b199210f69fd Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 17 Feb 2025 11:24:43 +0100 Subject: [PATCH 004/127] [clang][bytecode] Restructure Program::CurrentDeclaration handling (#127456) Properly reset to the last ID and return the current ID from getCurrentDecl(). --- clang/lib/AST/ByteCode/Compiler.cpp | 2 +- clang/lib/AST/ByteCode/Program.h | 23 +++++++------------ .../AST/ByteCode/libcxx/global-decl-id.cpp | 22 ++++++++++++++++++ 3 files changed, 31 insertions(+), 16 deletions(-) create mode 100644 clang/test/AST/ByteCode/libcxx/global-decl-id.cpp diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 59c236c9da8c8..b3a81f8ff1516 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -29,7 +29,7 @@ namespace interp { template class DeclScope final : public LocalScope { public: DeclScope(Compiler *Ctx, const ValueDecl *VD) - : LocalScope(Ctx, VD), Scope(Ctx->P, VD), + : LocalScope(Ctx, VD), Scope(Ctx->P), OldInitializingDecl(Ctx->InitializingDecl) { Ctx->InitializingDecl = VD; Ctx->InitStack.push_back(InitLink::Decl(VD)); diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h index c9c3d20f198c6..d503652abb96f 100644 --- a/clang/lib/AST/ByteCode/Program.h +++ b/clang/lib/AST/ByteCode/Program.h @@ -132,20 +132,22 @@ class Program final { /// Context to manage declaration lifetimes. 
class DeclScope { public: - DeclScope(Program &P, const ValueDecl *VD) : P(P) { - P.startDeclaration(VD); + DeclScope(Program &P) : P(P), PrevDecl(P.CurrentDeclaration) { + ++P.LastDeclaration; + P.CurrentDeclaration = P.LastDeclaration; } - ~DeclScope() { P.endDeclaration(); } + ~DeclScope() { P.CurrentDeclaration = PrevDecl; } private: Program &P; + unsigned PrevDecl; }; /// Returns the current declaration ID. std::optional getCurrentDecl() const { if (CurrentDeclaration == NoDeclaration) - return std::optional{}; - return LastDeclaration; + return std::nullopt; + return CurrentDeclaration; } private: @@ -218,21 +220,12 @@ class Program final { } /// No declaration ID. - static constexpr unsigned NoDeclaration = (unsigned)-1; + static constexpr unsigned NoDeclaration = ~0u; /// Last declaration ID. unsigned LastDeclaration = 0; /// Current declaration ID. unsigned CurrentDeclaration = NoDeclaration; - /// Starts evaluating a declaration. - void startDeclaration(const ValueDecl *Decl) { - LastDeclaration += 1; - CurrentDeclaration = LastDeclaration; - } - - /// Ends a global declaration. - void endDeclaration() { CurrentDeclaration = NoDeclaration; } - public: /// Dumps the disassembled bytecode to \c llvm::errs(). void dump() const; diff --git a/clang/test/AST/ByteCode/libcxx/global-decl-id.cpp b/clang/test/AST/ByteCode/libcxx/global-decl-id.cpp new file mode 100644 index 0000000000000..0dd583c3d467f --- /dev/null +++ b/clang/test/AST/ByteCode/libcxx/global-decl-id.cpp @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -std=c++2c -fexperimental-new-constant-interpreter -verify=expected,both %s +// RUN: %clang_cc1 -std=c++2c -verify=ref,both %s + +// both-no-diagnostics + +namespace std { +constexpr int +midpoint(int __a, int ) { + constexpr unsigned __half_diff = 0; + return __half_diff; +} +} +struct Tuple { + int min; + int mid; + constexpr Tuple() { + min = 0; + mid = std::midpoint(min, min); + } +}; +constexpr Tuple tup; + From 9d24f943794420e512512eb9329341355e9289f8 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 17 Feb 2025 10:31:31 +0000 Subject: [PATCH 005/127] [X86] combineConcatVectorOps - remove duplicate DAG.getContext() call. NFC. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 21b08a4a93fc7..85ad391ade299 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -57582,7 +57582,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType(); if (SubVT.isSimple() && SubVT.isVector()) { EVT ConcatVT = - EVT::getVectorVT(*DAG.getContext(), SubVT.getScalarType(), + EVT::getVectorVT(Ctx, SubVT.getScalarType(), SubVT.getVectorElementCount() * Subs.size()); for (SDValue &Sub : Subs) Sub = DAG.getBitcast(SubVT, Sub); From 517800e37e8d3a4ee84214bef65e227612c2a98b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Mon, 17 Feb 2025 10:44:27 +0000 Subject: [PATCH 006/127] [mlir][tensor][linalg] Move Pack/UnPack Ops to Linalg (#123902) Moves `PackOp` and `UnPackOp` from the Tensor dialect to Linalg. This change was discussed in the following RFC: * https://discourse.llvm.org/t/rfc-move-tensor-pack-and-tensor-unpack-into-linalg This change involves significant churn but only relocates existing code - no new functionality is added. 
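For illustration only (this example is adapted from the PackOp documentation added by this patch; `%source` and `%dest` stand for suitably typed tensor values), the only change visible at the IR level is the dialect prefix of the ops:

```mlir
// Before this change (tensor dialect spelling):
%0 = tensor.pack %source inner_dims_pos = [0, 1] inner_tiles = [8, 32]
    into %dest : tensor<128x256xf32> -> tensor<16x8x8x32xf32>

// After this change (linalg dialect spelling, identical operands and semantics):
%0 = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [8, 32]
    into %dest : tensor<128x256xf32> -> tensor<16x8x8x32xf32>
```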
**Note for Downstream Users**

Downstream users must update references to `PackOp` and `UnPackOp` as follows:
* Code: `s/tensor::(Un)PackOp/linalg::(Un)PackOp/g`
* Tests: `s/tensor.(un)pack/linalg.(un)pack/g`

No other modifications should be required.
---
 .../mlir/Dialect/Linalg/IR/CMakeLists.txt | 7 +
 mlir/include/mlir/Dialect/Linalg/IR/Linalg.h | 3 +
 .../Dialect/Linalg/IR/LinalgInterfaces.td | 10 +
 .../Dialect/Linalg/IR/LinalgRelayoutOps.td | 336 +++++
 .../Linalg/TransformOps/LinalgTransformOps.td | 86 +-
 .../Linalg/Transforms/TilingInterfaceImpl.h | 5 +
 .../Dialect/Linalg/Transforms/Transforms.h | 59 +-
 .../include/mlir/Dialect/Linalg/Utils/Utils.h | 18 +
 .../mlir/Dialect/Tensor/IR/CMakeLists.txt | 6 -
 mlir/include/mlir/Dialect/Tensor/IR/Tensor.h | 6 -
 .../Dialect/Tensor/IR/TensorInterfaces.td | 33 -
 .../mlir/Dialect/Tensor/IR/TensorOps.td | 310 -----
 .../Tensor/TransformOps/TensorTransformOps.td | 10 -
 .../Dialect/Tensor/Transforms/Transforms.h | 9 -
 .../include/mlir/Dialect/Tensor/Utils/Utils.h | 19 -
 .../mlir/Dialect/Utils/ReshapeOpsUtils.h | 7 +
 mlir/lib/Dialect/Linalg/IR/CMakeLists.txt | 1 +
 mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp | 15 +-
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 1095 ++++++++++++++++-
 .../TransformOps/LinalgTransformOps.cpp | 26 +-
 .../Linalg/Transforms/BlockPackMatmul.cpp | 2 +-
 .../Dialect/Linalg/Transforms/CMakeLists.txt | 1 +
 .../Transforms/DataLayoutPropagation.cpp | 120 +-
 .../Transforms/PackAndUnpackPatterns.cpp | 65 +-
 .../Linalg/Transforms/TilingInterfaceImpl.cpp | 655 ++++++++++
 .../Dialect/Linalg/Transforms/Transforms.cpp | 40 +-
 .../Linalg/Transforms/Vectorization.cpp | 38 +-
 mlir/lib/Dialect/Linalg/Utils/Utils.cpp | 54 +
 mlir/lib/Dialect/Tensor/IR/CMakeLists.txt | 1 -
 mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp | 2 +-
 mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 1078 +---------------
 .../Tensor/IR/TensorTilingInterfaceImpl.cpp | 652 ----------
 .../TransformOps/TensorTransformOps.cpp | 5 -
 .../Dialect/Tensor/Transforms/CMakeLists.txt | 1 -
 .../Tensor/Transforms/EmptyOpPatterns.cpp | 48 +-
 mlir/lib/Dialect/Tensor/Utils/Utils.cpp | 55 -
 mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp | 10 +
 .../Linalg/block-pack-matmul-layout.mlir | 36 +-
 .../Linalg/block-pack-matmul-padding.mlir | 20 +-
 .../Dialect/Linalg/block-pack-matmul.mlir | 90 +-
 mlir/test/Dialect/Linalg/canonicalize.mlir | 502 +++++++-
 .../Linalg/data-layout-propagation.mlir | 258 ++--
 .../Linalg/decompose-tensor-pack-tile.mlir | 12 +-
 .../Dialect/Linalg/decompose-tensor-pack.mlir | 22 +-
 .../Linalg/decompose-tensor-unpack-tile.mlir | 12 +-
 .../Linalg/decompose-tensor-unpack.mlir | 18 +-
 mlir/test/Dialect/Linalg/fold-empty-op.mlir | 82 ++
 mlir/test/Dialect/Linalg/invalid.mlir | 185 +++
 mlir/test/Dialect/Linalg/named-ops.mlir | 105 ++
 .../simplify-pack-unpack.mlir | 92 +-
 .../Dialect/Linalg/td/decompose-pack.mlir | 2 +-
 .../Dialect/Linalg/td/decompose-unpack.mlir | 2 +-
 .../Dialect/Linalg/transform-lower-pack.mlir | 172 +--
 .../Dialect/Linalg/transform-op-fuse.mlir | 12 +-
 .../Dialect/Linalg/transform-op-pack.mlir | 124 +-
 .../Linalg/transform-op-tile-pack-unpack.mlir | 491 ++++++++
 .../Linalg/transform-pack-greedily.mlir | 12 +-
 .../transform-tile-and-fuse-pack-unpack.mlir | 32 +-
 .../Linalg/vectorization-unsupported.mlir | 4 +-
 .../Linalg/vectorization-with-patterns.mlir | 8 +-
 mlir/test/Dialect/Linalg/vectorization.mlir | 48 +-
 mlir/test/Dialect/Tensor/canonicalize.mlir | 474 -------
 mlir/test/Dialect/Tensor/fold-empty-op.mlir | 71 --
 .../Tensor/fold-into-pack-and-unpack.mlir | 198 +--
mlir/test/Dialect/Tensor/invalid.mlir | 175 --- mlir/test/Dialect/Tensor/ops.mlir | 103 -- mlir/test/Dialect/Tensor/tiling.mlir | 492 -------- .../CPU/ArmSVE/pack-scalable-inner-tile.mlir | 8 +- .../Linalg/CPU/pack-dynamic-inner-tile.mlir | 8 +- .../Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir | 30 +- .../Linalg/CPU/unpack-dynamic-inner-tile.mlir | 8 +- .../tile-and-fuse-consumer.mlir | 16 +- .../tile-and-fuse-using-interface.mlir | 4 +- .../loop-invariant-code-motion.mlir | 20 +- .../Dialect/Linalg/TestLinalgTransforms.cpp | 28 +- .../Dialect/Tensor/TestTensorTransforms.cpp | 26 - 76 files changed, 4496 insertions(+), 4394 deletions(-) create mode 100644 mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td delete mode 100644 mlir/include/mlir/Dialect/Tensor/IR/TensorInterfaces.td rename mlir/lib/Dialect/{Tensor => Linalg}/Transforms/PackAndUnpackPatterns.cpp (90%) create mode 100644 mlir/test/Dialect/Linalg/fold-empty-op.mlir rename mlir/test/Dialect/{Tensor => Linalg}/simplify-pack-unpack.mlir (86%) create mode 100644 mlir/test/Dialect/Linalg/transform-op-tile-pack-unpack.mlir diff --git a/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt index 71214b4404c55..efd708c5e5a11 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt @@ -65,6 +65,13 @@ add_public_tablegen_target(MLIRLinalgStructuredOpsIncGen) add_dependencies(MLIRLinalgStructuredOpsIncGen LinalgOdsGen) add_dependencies(mlir-headers MLIRLinalgStructuredOpsIncGen) +set(LLVM_TARGET_DEFINITIONS LinalgRelayoutOps.td) +mlir_tablegen(LinalgRelayoutOps.h.inc -gen-op-decls) +mlir_tablegen(LinalgRelayoutOps.cpp.inc -gen-op-defs) +add_public_tablegen_target(MLIRLinalgRelayoutOpsIncGen) +add_dependencies(MLIRLinalgRelayoutOpsIncGen LinalgOdsGen) +add_dependencies(mlir-headers MLIRLinalgRelayoutOpsIncGen) + set(LLVM_TARGET_DEFINITIONS LinalgInterfaces.td) mlir_tablegen(LinalgInterfaces.h.inc -gen-op-interface-decls) mlir_tablegen(LinalgInterfaces.cpp.inc -gen-op-interface-defs) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/Linalg.h b/mlir/include/mlir/Dialect/Linalg/IR/Linalg.h index 85f5ebeb8081e..57bf6305a469d 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/Linalg.h +++ b/mlir/include/mlir/Dialect/Linalg/IR/Linalg.h @@ -123,4 +123,7 @@ OpFoldResult createFoldedDimOp(OpBuilder &b, Location loc, Value val, #define GET_OP_CLASSES #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.h.inc" +#define GET_OP_CLASSES +#include "mlir/Dialect/Linalg/IR/LinalgRelayoutOps.h.inc" + #endif // MLIR_DIALECT_LINALG_IR_LINALG_H diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td index dbc1ac60e0973..247afc141c180 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td @@ -178,6 +178,16 @@ def LinalgConvolutionOpInterface : OpInterface<"ConvolutionOpInterface"> { ]; } +def LinalgRelayoutOpInterface : OpInterface<"RelayoutOpInterface"> { + let description = [{ + A Linalg relayout-op is either linalg.pack or linalg.unpack. + + While we could extend this interface with methods from Linalg_RelayoutOp, + this is currently not needed and left as a TODO. 
+ }]; + let cppNamespace = "::mlir::linalg"; +} + def LinalgFillOpInterface : OpInterface<"FillOpInterface"> { let description = [{ A fill operation is defined in general terms: diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td new file mode 100644 index 0000000000000..a08a778fc25e1 --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td @@ -0,0 +1,336 @@ +//===- LinalgReleayoutOps.td - Linalg relayout ops ---------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines Pack + Unpack Ops that have been moved from the Tensor +// dialect. As such, these are defined as memory-effect-free and only accept +// "tensors" as inputs. +// +// TODO: Once a good motivating example is identified, relax these +// restrictions. +// +//===----------------------------------------------------------------------===// + +#ifndef LINALG_RELEAYOUT_OPS +#define LINALG_RELEAYOUT_OPS + +include "mlir/Dialect/Linalg/IR/LinalgBase.td" +include "mlir/Interfaces/DestinationStyleOpInterface.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/Dialect/Linalg/IR/LinalgInterfaces.td" +include "mlir/IR/OpAsmInterface.td" + +//===----------------------------------------------------------------------===// +// RelayoutOp +//===----------------------------------------------------------------------===// + +class Linalg_RelayoutOp traits = []> : + Op, + DestinationStyleOpInterface, LinalgRelayoutOpInterface, + ConditionallySpeculatable, NoMemoryEffect, + DeclareOpInterfaceMethods, + TypesMatchWith<"result type matches type of dest", + "dest", "result", + "$_self">])> { + + code commonExtraClassDeclaration = [{ + size_t getSourceRank() { return getSourceType().getRank(); }; + size_t getDestRank() { return getDestType().getRank(); }; + RankedTensorType getSourceType() { + return ::llvm::cast(getSource().getType()); }; + RankedTensorType getDestType() { + return ::llvm::cast(getDest().getType()); }; + + MutableOperandRange getDpsInitsMutable() { return getDestMutable(); } + + /// Interface method for ConditionallySpeculatable. + Speculation::Speculatability getSpeculatability(); + + /// Return a mapping from positions `inner_dims_pos` to their + /// tile factors. + DenseMap getDimAndTileMapping(); + + /// Return the tile sizes as OpFoldResult. + SmallVector getMixedTiles(); + + /// Return the tile sizes as `int64_t`. If a tile size is dynamic + /// a sentinel `kDynamic` is introduced at that position in + /// the returned vector. + SmallVector getStaticTiles(); + + /// Retrieve all outer dims for this Pack/UnPack Op, i.e. all the leading + /// dims excluding the trailing dims corresponding to `innerTiles`. Note + /// that this will include both tiled and non-tiled dimensions. The order + /// of the output dimensions is consistent with the shape of the packed + /// tensor. + ArrayRef getAllOuterDims(); + + /// Similar to `getAllOuterDims`, but only retrieve the outer dims that + /// have been tiled. Also, the order of the output dimensions is consistent + /// with `inner_dims_pos` rather than the packed tensor. 
+ SmallVector getTiledOuterDims(); + }]; + + let hasVerifier = 1; +} + +//===----------------------------------------------------------------------===// +// PackOp +//===----------------------------------------------------------------------===// + +def Linalg_PackOp : Linalg_RelayoutOp<"pack", [ + AttrSizedOperandSegments]> { + let summary = "linalg.pack operation"; + let description = [{ + The "pack" operation converts a source tensor of rank `n` into a result + tensor of rank `n + k` with a tiled and packed layout (maybe with padding) + and optionally transposes the tiled source tensor dimensions. + + `inner_dims_pos` (mandatory) specifies `k` source tensor dimensions that are + being tiled, where `0 < k <= n`. The order of the dimensions matters: + - The tiled dimensions (of size `inner_tiles`) are added to the end of the result + tensor in the order in which they appear in `inner_dims_pos`. + - `inner_dims_pos[i]` specifies the source tensor dimension tiled by + `inner_tiles[i]`. + + `inner_tiles` (mandatory) specifies `k` tile sizes. These tile sizes + correspond to the least significant ("inner") result tensor dimension sizes, + in the same order. Tile sizes can be static or dynamic. + + Example: If `inner_tiles = [16, 32]`, the result tensor has a shape of + `...x16x32`. If `inner_dims_pos = [0, 1]`, the 0th source dimension is tiled + by 16 and the 1st source dimension is tiled by 32. Other source dimensions + (if any) are not tiled. If `inner_dims_pos = [1, 0]`, the 1st dimension is + tiled by 16 and the 0th dimension is tiled by 32. + + Example: + ```mlir + // NC to NCnc + %0 = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [8, 32] + into %dest : tensor<128x256xf32> -> tensor<16x8 x 8x32 xf32> + // \ / \ / + // outer dims inner dims + ``` + + `outer_dims_perm` (optional) specifies a permutation for the outer + dimensions. If specified, it must have `n` elements. + + Example: + ```mlir + // CK to KCck + %0 = linalg.pack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] + inner_tiles = [8, 32] into %dest + : tensor<128x256xf32> -> tensor<8x16 x 8x32 xf32> + // \ / + // compare with "NC to NCnc": outer dims are transposed + ``` + + `padding_value` specifies a padding value at the boundary on non-perfectly + divisible dimensions. Padding is optional: + - If absent, it is UB if the tile does not perfectly divide the dimension. + - If present, it will pad along high dimensions (high-padding) to make the + tile complete. + + Example: + ```mlir + %0 = linalg.pack %arg0 padding_value(%pad : f32) outer_dims_perm = [2, 1, 0] + inner_dims_pos = [1] inner_tiles = [2] into %arg1 + : tensor<200x127x256xf32> -> tensor<256x64x200x2xf32> + // \ + // padded and tiled dim + // + // Source dimension 1 is tiled. 64 does not divide 127 evenly, so 1 padded + // element is added at the end. + // + // Note: Only tiled dimensions can be padded. + ``` + }]; + let arguments = (ins AnyRankedTensor:$source, + AnyRankedTensor:$dest, + Optional:$padding_value, + DefaultValuedOptionalAttr:$outer_dims_perm, + DenseI64ArrayAttr:$inner_dims_pos, + Variadic:$inner_tiles, + DenseI64ArrayAttr:$static_inner_tiles); + let results = (outs AnyRankedTensor:$result); + let assemblyFormat = [{ + $source + (`padding_value` `(` $padding_value^ `:` type($padding_value) `)`)? + (`outer_dims_perm` `=` $outer_dims_perm^)? 
+ `inner_dims_pos` `=` $inner_dims_pos + `inner_tiles` `=` + custom($inner_tiles, $static_inner_tiles) + `into` $dest attr-dict `:` type($source) `->` type($dest) + }]; + + let builders = [ + OpBuilder<(ins "Value":$source, "Value":$dest, + "ArrayRef":$innerDimsPos, + "ArrayRef":$innerTiles, + CArg<"std::optional", "std::nullopt">:$paddingValue, + CArg<"ArrayRef", "{}">:$outerDimsPerm)> + ]; + + let extraClassDeclaration = commonExtraClassDeclaration # [{ + // Method to get the shape of the result as `SmallVector`. + // This is a static method to allow getting the shape of the destination + // expected while creating a `pack` op. + static SmallVector getResultShape(OpBuilder &builder, + Location loc, ArrayRef sourceDims, + ArrayRef innerTileDims, ArrayRef innerDimsPos, + ArrayRef outerDimsPerm = {}); + + // Method to get the `RankedTensorType` of the result based on the inner + // tiles, position of the inner tiles (innerDimsPos) and interchange vector + // of outer loops (outerDimsPerm). + static RankedTensorType inferPackedType(RankedTensorType sourceType, + ArrayRef innerTileSizes, ArrayRef innerDimsPos, + ArrayRef outerDimsPerm = {}); + + // Returns true if we have enough static information to catch undefined + // behavior when the tile size does not divide perfectly the dimension of + // the input tensor. Detecting UB requires that the input size and either + // corresponding tile or output size are static. + static bool requirePaddingValue(ArrayRef inputShape, + ArrayRef innerDimsPos, + ArrayRef outputShape, + ArrayRef outerDimsPerm, + ArrayRef innerTiles); + + static Value createDestinationTensor(OpBuilder &b, Location loc, + Value source, ArrayRef innerTileSizes, + ArrayRef innerDimsPos, ArrayRef outerDimsPerm); + + /// Build and return a new PackOp that is a clone of the current PackOp with + /// (innerDimsPos, innerTiles) (resp. outerDimsPerm) are permuted by + /// innerPermutation (resp. outerPermutation). + /// A new `tensor.empty` of the proper shape is built in the process. + /// Asserts that: + /// - At least one of innerPermutation or outerPermutation is non-empty. + /// - If not empty, innerPermutation is a valid permutation of size + /// matching innerDimPos. + /// - If not empty, outerPermutation is a valid permutation of size + /// matching outerDimsPerm. + PackOp createTransposedClone(OpBuilder &b, + Location loc, + ArrayRef innerPermutation, + ArrayRef outerPermutation); + + /// Check if this PackOp is like a simple pad operation. + /// In other words, this operation: + /// 1. adds useless dimensions (dimension of size 1), + /// 2. pads the other ones, and + /// 3. doesn't shuffle the dimensions + bool isLikePad(); + }]; + + let hasCanonicalizeMethod = 1; + + let hasFolder = 1; +} + +//===----------------------------------------------------------------------===// +// UnPackOp +//===----------------------------------------------------------------------===// + +def Linalg_UnPackOp : Linalg_RelayoutOp<"unpack"> { + let summary = "linalg.unpack operation"; + let description = [{ + The "unpack" operation converts a source tensor of rank `n` with a tiled and + packed layout to a result tensor of rank `n - k`. + + `inner_dims_pos` (mandatory) specifies `k` source tensor dimensions with + which the last `k` source tensor dimensions are combined, where + `0 < k <= n/2`. Each `inner_dims_pos` element must be `>= 0` and `< n - k`. 
+ The order of the dimensions in `inner_dims_pos` matters: dimension + `inner_dims_pos[i]` is combined with dimension `n - k + i` (assuming that + `outer_dims_perm` is not specified). + + `inner_tiles` (mandatory) specifies `k` tile sizes. These tile sizes + correspond to the least significant ("inner") source tensor dimension sizes. + The behavior of this op is undefined if: + - `inner_tiles` do not exactly match with the corresponding source tensor + dimension sizes. + - Or, `inner_tiles[i]` does not divide the size of dimension + `inner_dims_pos[i]` (assuming that `outer_dims_perm` is not specified) + evenly. + + `outer_dims_perm` (optional) specifies a permutation for the outer + dimensions. If specified, it must have `n - k` elements. If specified, this + permutation is applied before combining any dimensions. + + Example: + + ```mlir + // NCnc to NC: + %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [8, 32] + into %dest : tensor<16x8x8x32xf32> -> tensor<128x256xf32> + + // CK to KCck: + %0 = linalg.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] + inner_tiles = [8, 32] into %dest + : tensor<8x16x8x32xf32> -> tensor<128x256xf32> + ``` + }]; + let arguments = (ins AnyRankedTensor:$source, + AnyRankedTensor:$dest, + DefaultValuedOptionalAttr:$outer_dims_perm, + DenseI64ArrayAttr:$inner_dims_pos, + Variadic:$inner_tiles, + DenseI64ArrayAttr:$static_inner_tiles); + let results = (outs AnyRankedTensor:$result); + let assemblyFormat = [{ + $source + (`outer_dims_perm` `=` $outer_dims_perm^)? + `inner_dims_pos` `=` $inner_dims_pos + `inner_tiles` `=` + custom($inner_tiles, $static_inner_tiles) + `into` $dest attr-dict `:` type($source) `->` type($dest) + }]; + + let builders = [ + OpBuilder<(ins "Value":$source, "Value":$dest, + "ArrayRef":$innerDimsPos, + "ArrayRef":$innerTiles, + CArg<"ArrayRef", "{}">:$outerDimsPerm)> + ]; + + let extraClassDeclaration = commonExtraClassDeclaration # [{ + static Value createDestinationTensor(OpBuilder &b, Location loc, + Value source, ArrayRef innerTileSizes, + ArrayRef innerDimsPos, ArrayRef outerDimsPerm); + + /// Build and return a new UnPackOp that is a clone of the current UnPackOp + /// with (innerDimsPos, innerTiles) (resp. outerDimsPerm) are permuted by + /// innerPermutation (resp. outerPermutation). + /// Asserts that: + /// - At least one of innerPermutation or outerPermutation is non-empty. + /// - If not empty, innerPermutation is a valid permutation of size + /// matching innerDimPos. + /// - If not empty, outerPermutation is a valid permutation of size + /// matching outerDimsPerm. + UnPackOp createTransposedClone(OpBuilder &b, + Location loc, + Value transposedSource, + ArrayRef innerPermutation, + ArrayRef outerPermutation); + + /// Check if this UnPackOp is like a simple unpad operation. + /// In other words, this operation: + /// 1. drops useless dimensions (dimension of size 1), and + /// 2. reduces dimensions in place (i.e., no transpose.) 
+ bool isLikeUnPad(); + }]; + + let hasCanonicalizeMethod = 1; + + let hasFolder = 1; +} + +#endif // LINALG_RELEAYOUT_OPS diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td index e86d175489775..12080cee85c9d 100644 --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -45,7 +45,7 @@ def ApplyDecomposeTensorPackUnpackPatternsOp : Op]> { let description = [{ - Collect patterns to decompose tensor.pack and tensor.unpack into e.g. + Collect patterns to decompose linalg.pack and linalg.unpack into e.g. tensor::PadOp, linalg::transposeOp Ops. Requires all outer dims to be unit. }]; @@ -126,6 +126,28 @@ def ApplyPadVectorizationPatternsOp : Op]> { + let description = [{ + Indicates that operations like tensor.pad and tensor.extract_slice should + be folded into linalg.pack and linalg.unpack operations, respectively. + }]; + + let assemblyFormat = "attr-dict"; +} + +def ApplyFoldPackUnpackIntoEmptyPatternsOp : Op]> { + let description = [{ + // TODO: + }]; + + let arguments = (ins DefaultValuedAttr:$fold_single_use_only); + let assemblyFormat = "attr-dict"; +} + //===----------------------------------------------------------------------===// // BufferizeToAllocationOp //===----------------------------------------------------------------------===// @@ -547,19 +569,18 @@ def LowerPackOp : Op { let description = [{ - Rewrite a tensor.pack into tensor.pad + tensor.expand_shape + linalg.transpose. + Rewrite a linalg.pack into tensor.pad + tensor.expand_shape + linalg.transpose. #### Return modes - This operation ignores non-pack ops and drops them in the return. - This operation produces a silenceable failure if the rewrite fails for any - reason. - If all the operations referred to by the `target` are rewritten, the - transform succeeds. - Return handles to the newly produced pad, expand_shape and transpose ops. + This operation ignores non-pack ops and drops them in the return. This + operation produces a silenceable failure if the rewrite fails for any + reason. If all the operations referred to by the `target` are rewritten, + the transform succeeds. Return handles to the newly produced pad, + expand_shape and transpose ops. }]; - let arguments = (ins Transform_ConcreteOpType<"tensor.pack">:$target, + let arguments = (ins Transform_ConcreteOpType<"linalg.pack">:$target, DefaultValuedAttr:$lowerPadLikeWithInsertSlice); let results = (outs Transform_ConcreteOpType<"tensor.pad">:$pad_op, Transform_ConcreteOpType<"tensor.expand_shape">:$expand_shape_op, @@ -571,7 +592,7 @@ def LowerPackOp : Op { let description = [{ - Lower a tensor.unpack into empty + linalg.transpose + tensor.collapse_shape + + Lower a linalg.unpack into empty + linalg.transpose + tensor.collapse_shape + tensor.extract_slice. #### Return modes - This operation ignores non-unpack ops and drops them in the return. - This operation produces a silenceable failure if the rewrite fails for any - reason. - If all the operations referred to by the `target` are rewritten, the - transform succeeds. - Return handles to the newly produced empty, transpose, collapse_shape and extract_slice ops. + This operation ignores non-unpack ops and drops them in the return. This + operation produces a silenceable failure if the rewrite fails for any + reason. If all the operations referred to by the `target` are rewritten, + the transform succeeds. 
Return handles to the newly produced empty, + transpose, collapse_shape and extract_slice ops. }]; - let arguments = (ins Transform_ConcreteOpType<"tensor.unpack">:$target, + let arguments = (ins Transform_ConcreteOpType<"linalg.unpack">:$target, DefaultValuedAttr:$lowerUnpadLikeWithExtractSlice); let results = (outs Transform_ConcreteOpType<"tensor.empty">:$empty_op, Transform_ConcreteOpType<"linalg.transpose">:$transpose_op, @@ -613,7 +633,7 @@ def LowerUnPackOp : Op, ReportTrackingListenerFailuresOpTrait]> { let description = [{ - Apply a transposition to a single `tensor.pack` (resp. `tensor.unpack`) and + Apply a transposition to a single `linalg.pack` (resp. `linalg.unpack`) and update the `linalg.generic` op that consumes (resp. produces) the operation. This transform allows composing a simple `structured.pack` with additional @@ -989,19 +1009,19 @@ def PackTransposeOp : Op lowerPack(RewriterBase &rewriter, - tensor::PackOp packOp, + linalg::PackOp packOp, bool lowerPadLikeWithInsertSlice = true); struct LowerUnPackOpResult { @@ -1134,14 +1134,14 @@ struct LowerUnPackOpResult { /// Rewrite pack as empty + transpose + reshape + extract_slice. FailureOr -lowerUnPack(RewriterBase &rewriter, tensor::UnPackOp unPackOp, +lowerUnPack(RewriterBase &rewriter, linalg::UnPackOp unPackOp, bool lowerUnpadLikeWithExtractSlice = true); /// Struct to hold the result of a `pack` call. struct PackResult { - SmallVector packOps; + SmallVector packOps; linalg::LinalgOp packedLinalgOp; - SmallVector unPackOps; + SmallVector unPackOps; }; /// Implement packing of a single LinalgOp by `packedSizes`. /// There must be one packedSizes entry per `linalgOp` iterator. @@ -1151,9 +1151,9 @@ FailureOr pack(RewriterBase &rewriter, linalg::LinalgOp linalgOp, /// Struct to hold the result of a `packTranspose` call. struct PackTransposeResult { - tensor::PackOp transposedPackOp; + linalg::PackOp transposedPackOp; linalg::LinalgOp transposedLinalgOp; - tensor::UnPackOp transposedUnPackOp; + linalg::UnPackOp transposedUnPackOp; }; /// Transpose a single PackOp -> LinalgOp -> UnPackOp chain and return the /// transposed PackOp -> LinalgOp -> UnPackOp chain after replacements. @@ -1164,8 +1164,8 @@ struct PackTransposeResult { /// 3. `outerPerm` (resp. `innerPerm`) must be valid permutations of /// `packOp.getOuterDimsPerm` (resp. `packOp.getInnerDimsPerm`) or empty. FailureOr -packTranspose(RewriterBase &rewriter, tensor::PackOp packOp, - linalg::LinalgOp linalgOp, tensor::UnPackOp maybeUnPackOp, +packTranspose(RewriterBase &rewriter, linalg::PackOp packOp, + linalg::LinalgOp linalgOp, linalg::UnPackOp maybeUnPackOp, ArrayRef outerPerm, ArrayRef innerPerm); /// Pack a LinalgOp by greedily inferring matmul dimensions (m, n, k) where m @@ -1526,15 +1526,15 @@ struct DecomposePadOpPattern : public OpRewritePattern { const SmallVector &dynSizes) const; }; -/// Rewrites a tensor::PackOp into a sequence of: +/// Rewrites a linalg::PackOp into a sequence of: /// * tensor::PadOp + linalg::TransposeOp + tensor::EmptyOp + /// tensor::InsertSliceOp ops. /// -/// Requires that all the outer dims of the input tensor::PackOp are 1. +/// Requires that all the outer dims of the input linalg::PackOp are 1. 
/// /// Before: /// ``` -/// %packed = tensor.pack %input +/// %packed = linalg.pack %input /// padding_value(%pad : f32) /// inner_dims_pos = [1, 0] /// inner_tiles = [2, %high] @@ -1560,20 +1560,20 @@ struct DecomposePadOpPattern : public OpRewritePattern { /// : tensor<2x?xf32> into tensor<1x1x2x?xf32> /// ``` struct DecomposeOuterUnitDimsPackOpPattern - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(tensor::PackOp packOp, + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(linalg::PackOp packOp, PatternRewriter &rewriter) const override; }; -/// Rewrites a tensor::UnPackOp into a sequence of rank-reduced +/// Rewrites a linalg::UnPackOp into a sequence of rank-reduced /// * tensor::ExtractSliceOp + linalg::TransposeOp + tensor::InsertSliceOp /// -/// Requires that all the outer dims of the input tensor::PackOp are 1. +/// Requires that all the outer dims of the input linalg::PackOp are 1. /// /// Before: /// ``` -/// %packed = tensor.unpack %input +/// %packed = linalg.unpack %input /// inner_dims_pos = [1, 0] /// inner_tiles = [2, 8] /// into %output : tensor<1x1x2x8xf32> -> tensor<5x1xf32> @@ -1594,9 +1594,9 @@ struct DecomposeOuterUnitDimsPackOpPattern /// : tensor<8x2xf32> to tensor<5x1xf32> /// ``` struct DecomposeOuterUnitDimsUnPackOpPattern - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(tensor::UnPackOp unpackOp, + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(linalg::UnPackOp unpackOp, PatternRewriter &rewriter) const override; }; @@ -1718,7 +1718,7 @@ void populateLinalgGenericOpsSpecializationPatterns( void populateDecomposeConvolutionPatterns(RewritePatternSet &patterns, PatternBenefit benefit = 1); -/// Populates patterns to decompose tensor.pack and tensor.unpack Ops into e.g. +/// Populates patterns to decompose linalg.pack and linalg.unpack Ops into e.g. /// tensor.pad, linalg.transpose, tensor.{insert|extract}_slice. Require all /// outer dims to be unit. void populateDecomposePackUnpackPatterns(RewritePatternSet &patterns); @@ -1780,7 +1780,7 @@ void populateElementwiseOpsFusionPatterns( RewritePatternSet &patterns, const ControlFusionFn &controlElementwiseOpFusion); -/// Function type which is used to control propagation of tensor.pack/unpack +/// Function type which is used to control propagation of linalg.pack/unpack /// ops. using ControlPropagationFn = std::function; @@ -1889,6 +1889,19 @@ void populateDecomposeWinogradOpsPatterns(RewritePatternSet &patterns); /// convert to a `linalg.dot`. void populateContractionOpRankReducingPatterns(RewritePatternSet &patterns); +/// Populates `patterns` with patterns that fold operations like `tensor.pad` +/// and `tensor.extract_slice` into `tensor.pack` and `tensor.unpack` operations +/// respectively. +void populateFoldIntoPackAndUnpackPatterns(RewritePatternSet &patterns); + +/// Populates `patterns` with patterns that fold operations like `linalg.pack` +/// and `linalg.unpack` into `tensor.empty`. +void populateFoldPackUnpackIntoTensorEmptyPatterns(RewritePatternSet &patterns); + +/// Populates `patterns` with patterns that simplify `tensor.pack` and +/// `tensor.unpack` operations. 
+void populateSimplifyPackAndUnpackPatterns(RewritePatternSet &patterns); + } // namespace linalg } // namespace mlir diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h index 1e4f3004dec7e..80aa034d2199d 100644 --- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h @@ -33,6 +33,24 @@ namespace linalg { //===----------------------------------------------------------------------===// // Utilities for inferring various semantics properties of Linalg ops. //===----------------------------------------------------------------------===// +/// Shell function to compute the Destination Permutation of PackOp +/// This function uses the helper function `computePackUnPackPerm` to get +/// the permutation vector. Only major difference between UnPack and Pack is +/// that packOp uses destination rank whereas unpack Uses source rank. +SmallVector getPackInverseDestPerm(linalg::PackOp packOp); + +/// Shell function to compute the Source Permutation of unPackOp. +/// This function, like the getPackInverseDestPerm uses the helper function +/// computePackUnPackPerm` to get the permutation vector. +/// Only major difference between UnPack and Pack is that packOp uses +/// destination rank whereas unpack Uses source rank. +SmallVector getUnPackInverseSrcPerm(linalg::UnPackOp unpackOp); + +/// Shell function to compute the Source rank permutation for unpackOp +/// Unpack requires some packing metadata data information, so created +/// another function where this value is passed by reference. +SmallVector getUnPackInverseSrcPerm(linalg::UnPackOp, + PackingMetadata &metadata); //===----------------------------------------------------------------------===// // General utilities diff --git a/mlir/include/mlir/Dialect/Tensor/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Tensor/IR/CMakeLists.txt index 74a05291376b3..cd14fe5c04561 100644 --- a/mlir/include/mlir/Dialect/Tensor/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Tensor/IR/CMakeLists.txt @@ -1,8 +1,2 @@ add_mlir_dialect(TensorOps tensor) add_mlir_doc(TensorOps TensorOps Dialects/ -gen-dialect-doc) - -set(LLVM_TARGET_DEFINITIONS TensorInterfaces.td) -mlir_tablegen(TensorInterfaces.h.inc -gen-op-interface-decls) -mlir_tablegen(TensorInterfaces.cpp.inc -gen-op-interface-defs) -add_public_tablegen_target(MLIRTensorInterfacesIncGen) -add_dependencies(mlir-headers MLIRTensorInterfacesIncGen) diff --git a/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h b/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h index b3ec796a72337..eb550bb469b9f 100644 --- a/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h +++ b/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h @@ -46,12 +46,6 @@ SmallVector getOrCreateRanges(OffsetSizeAndStrideOpInterface op, #include "mlir/Dialect/Tensor/IR/TensorOpsDialect.h.inc" -//===----------------------------------------------------------------------===// -// Tensor Interfaces -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Tensor/IR/TensorInterfaces.h.inc" - //===----------------------------------------------------------------------===// // Tensor Dialect Operations //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorInterfaces.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorInterfaces.td deleted file mode 100644 index 522a9c56f3c92..0000000000000 --- a/mlir/include/mlir/Dialect/Tensor/IR/TensorInterfaces.td +++ 
/dev/null @@ -1,33 +0,0 @@ -//===- TensorInterfaces.td - Tensor Interfaces Declaration -*- tablegen -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This is the definition file for the structured interface sfor Tensor ops. -// -//===----------------------------------------------------------------------===// - -#ifndef TENSOR_IR_TENSORINTERFACES -#define TENSOR_IR_TENSORINTERFACES - -include "mlir/Interfaces/DestinationStyleOpInterface.td" -include "mlir/IR/OpBase.td" - -// TODO: To be moved to LinalgInterfaces.td, see: -// * https://github.com/llvm/llvm-project/pull/123902 -// * https://discourse.llvm.org/t/rfc-move-tensor-pack-and-tensor-unpack-into-linalg/ -def TensorRelayoutOpInterface : OpInterface<"RelayoutOpInterface"> { - let description = [{ - A Tensor (soon to be Linalg) relayout-op is either tensor.pack or - tensor.unpack. - - While we could extend this interface with methods from Tensor_RelayoutOp, - this is currently not needed and left as a TODO. - }]; - let cppNamespace = "::mlir::tensor"; -} - -#endif // TENSOR_IR_TENSORINTERFACES diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td index f6927f5ebcfb8..35d0b16628417 100644 --- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td +++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td @@ -10,7 +10,6 @@ #define TENSOR_OPS include "mlir/Dialect/Tensor/IR/TensorBase.td" -include "mlir/Dialect/Tensor/IR/TensorInterfaces.td" include "mlir/Interfaces/CastInterfaces.td" include "mlir/Interfaces/ControlFlowInterfaces.td" include "mlir/Interfaces/DestinationStyleOpInterface.td" @@ -1824,315 +1823,6 @@ def Tensor_SplatOp : Tensor_Op<"splat", [ let hasVerifier = 1; } -//===----------------------------------------------------------------------===// -// RelayoutOp -//===----------------------------------------------------------------------===// - -class Tensor_RelayoutOp traits = []> : - Tensor_Op, - DestinationStyleOpInterface, - ConditionallySpeculatable, NoMemoryEffect, - DeclareOpInterfaceMethods, - TensorRelayoutOpInterface, - TypesMatchWith<"result type matches type of dest", - "dest", "result", - "$_self">])> { - - code commonExtraClassDeclaration = [{ - size_t getSourceRank() { return getSourceType().getRank(); }; - size_t getDestRank() { return getDestType().getRank(); }; - RankedTensorType getSourceType() { - return ::llvm::cast(getSource().getType()); }; - RankedTensorType getDestType() { - return ::llvm::cast(getDest().getType()); }; - - MutableOperandRange getDpsInitsMutable() { return getDestMutable(); } - - /// Interface method for ConditionallySpeculatable. - Speculation::Speculatability getSpeculatability(); - - /// Return a mapping from positions `inner_dims_pos` to their - /// tile factors. - DenseMap getDimAndTileMapping(); - - /// Return the tile sizes as OpFoldResult. - SmallVector getMixedTiles(); - - /// Return the tile sizes as `int64_t`. If a tile size is dynamic - /// a sentinel `kDynamic` is introduced at that position in - /// the returned vector. - SmallVector getStaticTiles(); - - /// Retrieve all outer dims for this Pack/UnPack Op, i.e. all the leading - /// dims excluding the trailing dims corresponding to `innerTiles`. Note - /// that this will include both tiled and non-tiled dimensions. 
The order - /// of the output dimensions is consistent with the shape of the packed - /// tensor. - ArrayRef getAllOuterDims(); - - /// Similar to `getAllOuterDims`, but only retrieve the outer dims that - /// have been tiled. Also, the order of the output dimensions is consistent - /// with `inner_dims_pos` rather than the packed tensor. - SmallVector getTiledOuterDims(); - }]; - - let hasVerifier = 1; -} - -//===----------------------------------------------------------------------===// -// PackOp -//===----------------------------------------------------------------------===// - -def Tensor_PackOp : Tensor_RelayoutOp<"pack", [ - AttrSizedOperandSegments]> { - let summary = "tensor pack operation"; - let description = [{ - The "pack" operation converts a source tensor of rank `n` into a result - tensor of rank `n + k` with a tiled and packed layout (maybe with padding) - and optionally transposes the tiled source tensor dimensions. - - `inner_dims_pos` (mandatory) specifies `k` source tensor dimensions that are - being tiled, where `0 < k <= n`. The order of the dimensions matters: - - The tiled dimensions (of size `inner_tiles`) are added to the end of the result - tensor in the order in which they appear in `inner_dims_pos`. - - `inner_dims_pos[i]` specifies the source tensor dimension tiled by - `inner_tiles[i]`. - - `inner_tiles` (mandatory) specifies `k` tile sizes. These tile sizes - correspond to the least significant ("inner") result tensor dimension sizes, - in the same order. Tile sizes can be static or dynamic. - - Example: If `inner_tiles = [16, 32]`, the result tensor has a shape of - `...x16x32`. If `inner_dims_pos = [0, 1]`, the 0th source dimension is tiled - by 16 and the 1st source dimension is tiled by 32. Other source dimensions - (if any) are not tiled. If `inner_dims_pos = [1, 0]`, the 1st dimension is - tiled by 16 and the 0th dimension is tiled by 32. - - Example: - ```mlir - // NC to NCnc - %0 = tensor.pack %source inner_dims_pos = [0, 1] inner_tiles = [8, 32] - into %dest : tensor<128x256xf32> -> tensor<16x8 x 8x32 xf32> - // \ / \ / - // outer dims inner dims - ``` - - `outer_dims_perm` (optional) specifies a permutation for the outer - dimensions. If specified, it must have `n` elements. - - Example: - ```mlir - // CK to KCck - %0 = tensor.pack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] - inner_tiles = [8, 32] into %dest - : tensor<128x256xf32> -> tensor<8x16 x 8x32 xf32> - // \ / - // compare with "NC to NCnc": outer dims are transposed - ``` - - `padding_value` specifies a padding value at the boundary on non-perfectly - divisible dimensions. Padding is optional: - - If absent, it is UB if the tile does not perfectly divide the dimension. - - If present, it will pad along high dimensions (high-padding) to make the - tile complete. - - Example: - ```mlir - %0 = tensor.pack %arg0 padding_value(%pad : f32) outer_dims_perm = [2, 1, 0] - inner_dims_pos = [1] inner_tiles = [2] into %arg1 - : tensor<200x127x256xf32> -> tensor<256x64x200x2xf32> - // \ - // padded and tiled dim - // - // Source dimension 1 is tiled. 64 does not divide 127 evenly, so 1 padded - // element is added at the end. - // - // Note: Only tiled dimensions can be padded. 
- ``` - }]; - let arguments = (ins AnyRankedTensor:$source, - AnyRankedTensor:$dest, - Optional:$padding_value, - DefaultValuedOptionalAttr:$outer_dims_perm, - DenseI64ArrayAttr:$inner_dims_pos, - Variadic:$inner_tiles, - DenseI64ArrayAttr:$static_inner_tiles); - let results = (outs AnyRankedTensor:$result); - let assemblyFormat = [{ - $source - (`padding_value` `(` $padding_value^ `:` type($padding_value) `)`)? - (`outer_dims_perm` `=` $outer_dims_perm^)? - `inner_dims_pos` `=` $inner_dims_pos - `inner_tiles` `=` - custom($inner_tiles, $static_inner_tiles) - `into` $dest attr-dict `:` type($source) `->` type($dest) - }]; - - let builders = [ - OpBuilder<(ins "Value":$source, "Value":$dest, - "ArrayRef":$innerDimsPos, - "ArrayRef":$innerTiles, - CArg<"std::optional", "std::nullopt">:$paddingValue, - CArg<"ArrayRef", "{}">:$outerDimsPerm)> - ]; - - let extraClassDeclaration = commonExtraClassDeclaration # [{ - // Method to get the shape of the result as `SmallVector`. - // This is a static method to allow getting the shape of the destination - // expected while creating a `pack` op. - static SmallVector getResultShape(OpBuilder &builder, - Location loc, ArrayRef sourceDims, - ArrayRef innerTileDims, ArrayRef innerDimsPos, - ArrayRef outerDimsPerm = {}); - - // Method to get the `RankedTensorType` of the result based on the inner - // tiles, position of the inner tiles (innerDimsPos) and interchange vector - // of outer loops (outerDimsPerm). - static RankedTensorType inferPackedType(RankedTensorType sourceType, - ArrayRef innerTileSizes, ArrayRef innerDimsPos, - ArrayRef outerDimsPerm = {}); - - // Returns true if we have enough static information to catch undefined - // behavior when the tile size does not divide perfectly the dimension of - // the input tensor. Detecting UB requires that the input size and either - // corresponding tile or output size are static. - static bool requirePaddingValue(ArrayRef inputShape, - ArrayRef innerDimsPos, - ArrayRef outputShape, - ArrayRef outerDimsPerm, - ArrayRef innerTiles); - - static Value createDestinationTensor(OpBuilder &b, Location loc, - Value source, ArrayRef innerTileSizes, - ArrayRef innerDimsPos, ArrayRef outerDimsPerm); - - /// Build and return a new PackOp that is a clone of the current PackOp with - /// (innerDimsPos, innerTiles) (resp. outerDimsPerm) are permuted by - /// innerPermutation (resp. outerPermutation). - /// A new `tensor.empty` of the proper shape is built in the process. - /// Asserts that: - /// - At least one of innerPermutation or outerPermutation is non-empty. - /// - If not empty, innerPermutation is a valid permutation of size - /// matching innerDimPos. - /// - If not empty, outerPermutation is a valid permutation of size - /// matching outerDimsPerm. - PackOp createTransposedClone(OpBuilder &b, - Location loc, - ArrayRef innerPermutation, - ArrayRef outerPermutation); - - /// Check if this PackOp is like a simple pad operation. - /// In other words, this operation: - /// 1. adds useless dimensions (dimension of size 1), - /// 2. pads the other ones, and - /// 3. 
doesn't shuffle the dimensions - bool isLikePad(); - }]; - - let hasCanonicalizeMethod = 1; - - let hasFolder = 1; -} - -//===----------------------------------------------------------------------===// -// UnPackOp -//===----------------------------------------------------------------------===// - -def Tensor_UnPackOp : Tensor_RelayoutOp<"unpack"> { - let summary = "tensor unpack operation"; - let description = [{ - The "unpack" operation converts a source tensor of rank `n` with a tiled and - packed layout to a result tensor of rank `n - k`. - - `inner_dims_pos` (mandatory) specifies `k` source tensor dimensions with - which the last `k` source tensor dimensions are combined, where - `0 < k <= n/2`. Each `inner_dims_pos` element must be `>= 0` and `< n - k`. - The order of the dimensions in `inner_dims_pos` matters: dimension - `inner_dims_pos[i]` is combined with dimension `n - k + i` (assuming that - `outer_dims_perm` is not specified). - - `inner_tiles` (mandatory) specifies `k` tile sizes. These tile sizes - correspond to the least significant ("inner") source tensor dimension sizes. - The behavior of this op is undefined if: - - `inner_tiles` do not exactly match with the corresponding source tensor - dimension sizes. - - Or, `inner_tiles[i]` does not divide the size of dimension - `inner_dims_pos[i]` (assuming that `outer_dims_perm` is not specified) - evenly. - - `outer_dims_perm` (optional) specifies a permutation for the outer - dimensions. If specified, it must have `n - k` elements. If specified, this - permutation is applied before combining any dimensions. - - Example: - - ```mlir - // NCnc to NC: - %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [8, 32] - into %dest : tensor<16x8x8x32xf32> -> tensor<128x256xf32> - - // CK to KCck: - %0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] - inner_tiles = [8, 32] into %dest - : tensor<8x16x8x32xf32> -> tensor<128x256xf32> - ``` - }]; - let arguments = (ins AnyRankedTensor:$source, - AnyRankedTensor:$dest, - DefaultValuedOptionalAttr:$outer_dims_perm, - DenseI64ArrayAttr:$inner_dims_pos, - Variadic:$inner_tiles, - DenseI64ArrayAttr:$static_inner_tiles); - let results = (outs AnyRankedTensor:$result); - let assemblyFormat = [{ - $source - (`outer_dims_perm` `=` $outer_dims_perm^)? - `inner_dims_pos` `=` $inner_dims_pos - `inner_tiles` `=` - custom($inner_tiles, $static_inner_tiles) - `into` $dest attr-dict `:` type($source) `->` type($dest) - }]; - - let builders = [ - OpBuilder<(ins "Value":$source, "Value":$dest, - "ArrayRef":$innerDimsPos, - "ArrayRef":$innerTiles, - CArg<"ArrayRef", "{}">:$outerDimsPerm)> - ]; - - let extraClassDeclaration = commonExtraClassDeclaration # [{ - static Value createDestinationTensor(OpBuilder &b, Location loc, - Value source, ArrayRef innerTileSizes, - ArrayRef innerDimsPos, ArrayRef outerDimsPerm); - - /// Build and return a new UnPackOp that is a clone of the current UnPackOp - /// with (innerDimsPos, innerTiles) (resp. outerDimsPerm) are permuted by - /// innerPermutation (resp. outerPermutation). - /// Asserts that: - /// - At least one of innerPermutation or outerPermutation is non-empty. - /// - If not empty, innerPermutation is a valid permutation of size - /// matching innerDimPos. - /// - If not empty, outerPermutation is a valid permutation of size - /// matching outerDimsPerm. 
- UnPackOp createTransposedClone(OpBuilder &b, - Location loc, - Value transposedSource, - ArrayRef innerPermutation, - ArrayRef outerPermutation); - - /// Check if this UnPackOp is like a simple unpad operation. - /// In other words, this operation: - /// 1. drops useless dimensions (dimension of size 1), and - /// 2. reduces dimensions in place (i.e., no transpose.) - bool isLikeUnPad(); - }]; - - let hasCanonicalizeMethod = 1; - - let hasFolder = 1; -} - //===----------------------------------------------------------------------===// // YieldOp //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td b/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td index 81bab1b0c82f7..fcb10f55d556d 100644 --- a/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td +++ b/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td @@ -53,16 +53,6 @@ def ApplyFoldTensorEmptyPatternsOp : Op:$fold_single_use_only); let assemblyFormat = "attr-dict"; } -def ApplyFoldIntoPackAndUnpackPatternsOp : Op]> { - let description = [{ - Indicates that operations like tensor.pad and tensor.extract_slice should - be folded into tensor.pack and tensor.unpack operations, respectively. - }]; - - let assemblyFormat = "attr-dict"; -} def ApplyFoldTensorSubsetOpsPatternsOp : Op; /// Populates `patterns` with patterns that replace tensor ops (such as diff --git a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h index ed1ec1e871482..83cc665b5a4fb 100644 --- a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h @@ -42,25 +42,6 @@ FailureOr computeTransposedType(RankedTensorType rankedTensorType, ArrayRef transposeVector); -/// Shell function to compute the Destination Permutation of PackOp -/// This function uses the helper function `computePackUnPackPerm` to get -/// the permutation vector. Only major difference between UnPack and Pack is -/// that packOp uses destination rank whereas unpack Uses source rank. -SmallVector getPackInverseDestPerm(tensor::PackOp packOp); - -/// Shell function to compute the Source Permutation of unPackOp. -/// This function, like the getPackInverseDestPerm uses the helper function -/// computePackUnPackPerm` to get the permutation vector. -/// Only major difference between UnPack and Pack is that packOp uses -/// destination rank whereas unpack Uses source rank. -SmallVector getUnPackInverseSrcPerm(tensor::UnPackOp unpackOp); - -/// Shell function to compute the Source rank permutation for unpackOp -/// Unpack requires some packing metadata data information, so created -/// another function where this value is passed by reference. -SmallVector getUnPackInverseSrcPerm(tensor::UnPackOp, - PackingMetadata &metadata); - /// A tensor.insert_slice is a cast-like operation if it merely rank-extends the /// source tensor or inserts the source tensor into a destination tensor with /// the same shape. diff --git a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h index 3fa35bf1851a9..3af89a6ab3799 100644 --- a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h +++ b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h @@ -568,6 +568,13 @@ struct PackingMetadata { // repeated N^2 counts). 
PackingMetadata computePackingMetadata(int64_t packedRank, ArrayRef innerDimPos); + +/// Try to remove a tensor operation if it would only reshape a constant. +/// Removes the op and replaces the constant with a new constant of the result +/// shape. When an optional cst attribute is passed, it is reshaped only if the +/// splat value matches the value in the attribute. +OpFoldResult reshapeConstantSource(DenseElementsAttr source, TensorType result, + std::optional cst = std::nullopt); } // namespace mlir #endif // MLIR_DIALECT_UTILS_RESHAPEOPSUTILS_H diff --git a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt index ce8dc6ccb0fa3..b4aeb44ac8faf 100644 --- a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt @@ -13,6 +13,7 @@ add_mlir_dialect_library(MLIRLinalgDialect MLIRLinalgOpsEnumsIncGen MLIRLinalgOpsIncGen MLIRLinalgStructuredOpsIncGen + MLIRLinalgRelayoutOpsIncGen MLIRShardingInterfaceIncGen LINK_LIBS PUBLIC diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp index 9e50c355c5041..c256b18dd2b17 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp @@ -114,6 +114,10 @@ void mlir::linalg::LinalgDialect::initialize() { #define GET_OP_LIST #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc" >(); + addOperations< +#define GET_OP_LIST +#include "mlir/Dialect/Linalg/IR/LinalgRelayoutOps.cpp.inc" + >(); // Fill the Linalg-specific OpName to RegionBuilder map. addNamedOpBuilders< @@ -130,13 +134,22 @@ void mlir::linalg::LinalgDialect::initialize() { >(); declarePromisedInterface(); declarePromisedInterface(); + + // ValueBoundsOpInterface declarePromisedInterface(); - declarePromisedInterface(); + declarePromisedInterface(); + + // Tiling Interface + declarePromisedInterface(); declarePromisedInterfaces(); + declarePromisedInterfaces(); declarePromisedInterfaces { /// 1. The pack op does not have padding value, or /// 2. The filled value and padding value are the same. static FailureOr foldFillPackIntoFillOp(RewriterBase &rewriter, - tensor::PackOp packOp) { + linalg::PackOp packOp) { auto fillOp = packOp.getSource().getDefiningOp(); if (!fillOp) return failure(); @@ -865,12 +866,12 @@ static FailureOr foldFillPackIntoFillOp(RewriterBase &rewriter, } /// Wrapper pattern that applies foldFillPackIntoFillOp method. 
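The guard for this fold is worth spelling out: packing may materialize padding elements, so folding the fill into the pack is only sound when the pack has no padding value or when the padding value equals the filled value. A minimal standalone sketch of that condition (plain C++ with illustrative names, not the MLIR helpers):

```cpp
#include <optional>

// Folding fill+pack is safe when the pack op has no padding value, or when
// the padding value equals the filled value (padded elements are then
// indistinguishable from filled ones).
bool canFoldFillIntoPack(float fillValue, std::optional<float> packPadding) {
  return !packPadding || *packPadding == fillValue;
}

int main() {
  bool a = canFoldFillIntoPack(0.0f, 0.0f);         // same value: foldable
  bool b = canFoldFillIntoPack(0.0f, std::nullopt); // no padding: foldable
  bool c = canFoldFillIntoPack(0.0f, 1.0f);         // differs: not foldable
  return (a && b && !c) ? 0 : 1;
}
```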
-struct FoldFillWithPack : public OpRewritePattern { +struct FoldFillWithPack : public OpRewritePattern { public: FoldFillWithPack(MLIRContext *context) - : OpRewritePattern(context) {} + : OpRewritePattern(context) {} - LogicalResult matchAndRewrite(tensor::PackOp packOp, + LogicalResult matchAndRewrite(linalg::PackOp packOp, PatternRewriter &rewriter) const override { auto fillOp = foldFillPackIntoFillOp(rewriter, packOp); if (failed(fillOp)) @@ -2289,6 +2290,8 @@ LogicalResult IndexOp::verify() { #define GET_OP_CLASSES #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc" +#define GET_OP_CLASSES +#include "mlir/Dialect/Linalg/IR/LinalgRelayoutOps.cpp.inc" AffineMap mlir::linalg::extractOrIdentityMap(std::optional maybeMap, unsigned rank, @@ -3412,20 +3415,9 @@ FailureOr WinogradOutputTransformOp::getTiledImplementation( //===----------------------------------------------------------------------===// // LinalgDialect +// TODO: Merge with the LinalgDialect block at the bottom //===----------------------------------------------------------------------===// -void LinalgDialect::getCanonicalizationPatterns( - RewritePatternSet &results) const { - results.add(getContext()); -} - -Operation *LinalgDialect::materializeConstant(OpBuilder &builder, - Attribute value, Type type, - Location loc) { - return arith::ConstantOp::materialize(builder, value, type, loc); -} - // Returns true if the result expression of `subMap` are a subset of `fullMap`. static bool areResultExprsSubsetOf(AffineMap subMap, AffineMap fullMap) { auto explicitRange = subMap.getResults(); @@ -4059,5 +4051,1076 @@ Speculation::Speculatability BatchMatmulOp::getSpeculatability() { return getGenericSpeculatabilityImpl(cast(getOperation())); } +//===----------------------------------------------------------------------===// +// PackOp/UnPackOp Common +//===----------------------------------------------------------------------===// +// Given the (potentially) updated packed type, `newPackedTy`, generates an +// updated mixed-tile-sizes attribute. A tile size is updated only +// when: +// * a dim from newPackedTy is static, and +// * the corresponding size from mixedTiles is still dynamic. +// Otherwise, the original tile size is preserved. +// Note - packed-type-dim and mixed-tile-size should always match! +static SmallVector +getNewMixedTileSizes(PatternRewriter &rewriter, Type newPackedTy, + SmallVector mixedTiles) { + SmallVector newMixedTileSizes; + for (auto it : llvm::zip(cast(newPackedTy) + .getShape() + .take_back(mixedTiles.size()), + mixedTiles)) { + int64_t shape = std::get<0>(it); + if (shape == ShapedType::kDynamic) { + newMixedTileSizes.push_back(std::get<1>(it)); + continue; + } + + // If the current result dim is static, update the dynamic mixed-size + // (provided the original value is dynamic). 
+ OpFoldResult tile = std::get<1>(it); + if (Attribute attr = llvm::dyn_cast_if_present(tile)) { + // Already a constant + newMixedTileSizes.push_back(tile); + } else { + assert(getConstantIntValue(tile).value() == shape && + "tile size and dim size don't match!"); + newMixedTileSizes.push_back( + (rewriter.getIntegerAttr(rewriter.getIndexType(), shape))); + } + } + + return newMixedTileSizes; +} + +template +static LogicalResult +reifyResultShapesImpl(OpTy op, OpBuilder &builder, + ReifiedRankedShapedTypeDims &reifiedReturnShapes) { + static_assert(llvm::is_one_of::value, + "applies to only pack or unpack operations"); + int64_t destRank = op.getDestRank(); + reifiedReturnShapes.resize(1, SmallVector(destRank)); + reifiedReturnShapes[0] = + tensor::getMixedSizes(builder, op.getLoc(), op.getDest()); + return success(); +} + +template +static DenseMap getDimAndTileMappingImpl(OpTy op) { + static_assert(llvm::is_one_of::value, + "applies to only pack or unpack operations"); + DenseMap dimAndTileMapping; + ArrayRef dimsToTile = op.getInnerDimsPos(); + SmallVector tiles = op.getMixedTiles(); + assert(tiles.size() == dimsToTile.size() && + "tiles must match indices of dimension to block"); + // bind the dimension `i` with the tile factor. + for (auto i : llvm::seq(0, dimsToTile.size())) + dimAndTileMapping[dimsToTile[i]] = tiles[i]; + return dimAndTileMapping; +} + +template +static SmallVector getMixedTilesImpl(OpTy op) { + static_assert(llvm::is_one_of::value, + "applies to only pack or unpack operations"); + Builder builder(op); + SmallVector mixedInnerTiles; + unsigned dynamicValIndex = 0; + for (int64_t staticTile : op.getStaticInnerTiles()) { + if (!ShapedType::isDynamic(staticTile)) + mixedInnerTiles.push_back(builder.getI64IntegerAttr(staticTile)); + else + mixedInnerTiles.push_back(op.getInnerTiles()[dynamicValIndex++]); + } + return mixedInnerTiles; +} + +template +static SmallVector getStaticTilesImpl(OpTy op) { + static_assert(llvm::is_one_of::value, + "applies to only pack or unpack operations"); + SmallVector dynamicTiles; + SmallVector staticTiles; + dispatchIndexOpFoldResults(op.getMixedTiles(), dynamicTiles, staticTiles); + return staticTiles; +} + +/// Returns true if `dimsPos` is invalid. It is invalid when: +/// a) It contains duplicate. +/// b) At least one dimension is out of bound (`dimPos` is >= 0 and < rank). +/// c) The number of elements in `dimsPos` is > than `rank`. +static bool isInvalidPackingPosSpecification(ArrayRef dimsPos, + size_t rank) { + size_t dimsPosSize = dimsPos.size(); + if (dimsPosSize > rank) + return true; + DenseSet uniqued; + for (int64_t dim : dimsPos) + uniqued.insert(dim); + if (dimsPosSize != uniqued.size()) + return true; + return llvm::any_of(dimsPos, [rank](int64_t dimPos) { + return dimPos < 0 || dimPos >= static_cast(rank); + }); +} + +/// Returns true if the dimension of `sourceShape` is smaller than the dimension +/// of the `limitShape`. 
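To make the bound check concrete: for a 20x30 source packed with inner_dims_pos = [0, 1] and inner_tiles = [8, 32], the smallest legal packed shape is ceil(20/8) x ceil(30/32) x 8 x 32 = 3x1x8x32, and dynamic extents are always accepted. A self-contained sketch of the check, with -1 standing in for a dynamic extent (not the MLIR implementation):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

constexpr int64_t kDynamic = -1; // stand-in for ShapedType::kDynamic

// Every static extent of `expected` must fit into the corresponding extent of
// `actual`; a dynamic extent on either side is always considered in bound.
bool allInBound(const std::vector<int64_t> &expected,
                const std::vector<int64_t> &actual) {
  assert(expected.size() == actual.size() && "ranks must match");
  for (size_t i = 0; i < expected.size(); ++i) {
    if (expected[i] == kDynamic || actual[i] == kDynamic)
      continue;
    if (expected[i] > actual[i])
      return false;
  }
  return true;
}

int main() {
  // Minimal packed shape for a 20x30 source tiled by [8, 32]:
  // ceil(20/8) = 3, ceil(30/32) = 1, plus the tile dims themselves.
  std::vector<int64_t> expected = {3, 1, 8, 32};
  bool ok1 = allInBound(expected, {3, 1, 8, 32});        // exact fit
  bool ok2 = allInBound(expected, {kDynamic, 1, 8, 32}); // dynamic outer dim
  bool ok3 = allInBound(expected, {2, 1, 8, 32});        // too small: rejected
  return (ok1 && ok2 && !ok3) ? 0 : 1;
}
```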
+static bool areAllInBound(ArrayRef sourceShape, + ArrayRef limitShape) { + assert( + sourceShape.size() == limitShape.size() && + "expected source shape rank, and limit of the shape to have same rank"); + return llvm::all_of( + llvm::zip(sourceShape, limitShape), [](std::tuple it) { + int64_t sourceExtent = std::get<0>(it); + int64_t limit = std::get<1>(it); + return ShapedType::isDynamic(sourceExtent) || + ShapedType::isDynamic(limit) || sourceExtent <= limit; + }); +} + +template +static LogicalResult commonVerifierPackAndUnPackOp(OpTy packOrUnPack) { + static_assert(llvm::is_one_of::value, + "applies to only pack or unpack operations"); + Operation *op = packOrUnPack.getOperation(); + + // Return true if we have a zero-value tile. + auto hasZeros = [&](ArrayRef tiles) { + return llvm::any_of( + tiles, [](OpFoldResult tile) { return isConstantIntValue(tile, 0); }); + }; + + // Verify tiles. Do not allow zero tiles. + SmallVector mixedTiles = packOrUnPack.getMixedTiles(); + if (hasZeros(mixedTiles)) + return op->emitError("invalid zero tile factor"); + + // Verify inner_dims_pos and outer_dims_perm. + RankedTensorType unpackedType = (std::is_same::value) + ? packOrUnPack.getSourceType() + : packOrUnPack.getDestType(); + size_t unpackedRank = unpackedType.getRank(); + ArrayRef innerDimsPos = packOrUnPack.getInnerDimsPos(); + ArrayRef outerDimPerm = packOrUnPack.getOuterDimsPerm(); + if (isInvalidPackingPosSpecification(innerDimsPos, unpackedRank)) + return op->emitError("invalid inner_dims_pos vector"); + if (isInvalidPackingPosSpecification(outerDimPerm, unpackedRank)) + return op->emitError("invalid outer_dims_perm vector"); + if (!outerDimPerm.empty() && outerDimPerm.size() != unpackedRank) + return op->emitError("outer_dims_perm must be a permutation or empty"); + + // Tiling factors must be less than or equal to the input rank for pack (or + // output rank for unpack), and must match the number of `inner_dims_pos`. + if (mixedTiles.size() > unpackedRank) { + return op->emitError("tiling factors must be less than or equal to the " + "input rank for pack or output rank for unpack"); + } + if (mixedTiles.size() != innerDimsPos.size()) { + return op->emitError( + "tiling factors must equal the number of dimensions to tile"); + } + + ShapedType packedType = (std::is_same::value) + ? packOrUnPack.getDestType() + : packOrUnPack.getSourceType(); + size_t packedRank = packedType.getRank(); + // Require output rank to match input rank + number of blocking factors. + size_t expectedPackedRank = unpackedRank + mixedTiles.size(); + if (expectedPackedRank != packedRank) { + return op->emitError( + "packed rank != (unpacked rank + num tiling factors), got ") + << packedRank << " != " << expectedPackedRank; + } + + // Verify result shape is greater than the minimum expected + // by the pack operation, and that the output shape + // represents full tiles. + RankedTensorType expectedPackedType = PackOp::inferPackedType( + unpackedType, packOrUnPack.getStaticTiles(), innerDimsPos, outerDimPerm); + if (!areAllInBound(expectedPackedType.getShape(), packedType.getShape())) { + return op->emitError("the shape of output is not large enough to hold the " + "packed data. 
Expected at least ") + << expectedPackedType << ", got " << packedType; + } + if (!llvm::all_of( + llvm::zip(packedType.getShape().take_back(mixedTiles.size()), + mixedTiles), + [](std::tuple it) { + int64_t shape = std::get<0>(it); + if (Attribute attr = + llvm::dyn_cast_if_present(std::get<1>(it))) { + IntegerAttr intAttr = dyn_cast_or_null(attr); + int64_t staticTileSize = intAttr.getValue().getSExtValue(); + return shape == staticTileSize; + } + return ShapedType::isDynamic(shape); + })) { + return op->emitError("mismatch in inner tile sizes specified and shaped of " + "tiled dimension in the packed type"); + } + return success(); +} + +namespace { +/// Subset of PackOp/UnPackOp fields used to compute the result of applying +/// various permutations to the op. +// TODO: Add linalg.transpose + pack/unpack folding patterns that just reuse +// these. These may or may not become true foldings / canonicalizations +// depending on how aggressive we want to be in automatically folding +// transposes. +struct PackOrUnPackTransposeResult { + SmallVector innerDimsPos; + SmallVector innerTiles; + SmallVector outerDimsPerm; +}; +} // namespace + +template +static PackOrUnPackTransposeResult +commonPermutationOfPackAndUnPackOp(OpTy packOrUnPackOp, + ArrayRef innerPermutation, + ArrayRef outerPermutation) { + static_assert(llvm::is_one_of::value, + "applies to only pack or unpack operations"); + assert((!innerPermutation.empty() || !outerPermutation.empty()) && + "some permutation must be non-empty"); + PackOrUnPackTransposeResult metadata; + metadata.innerDimsPos = + SmallVector(packOrUnPackOp.getInnerDimsPos()); + metadata.innerTiles = + SmallVector(packOrUnPackOp.getMixedTiles()); + int64_t numOuterDims = std::is_same::value + ? packOrUnPackOp.getSourceRank() + : packOrUnPackOp.getDestRank(); + metadata.outerDimsPerm = + packOrUnPackOp.getOuterDimsPerm().empty() + ? llvm::to_vector(llvm::seq(0, numOuterDims)) + : SmallVector(packOrUnPackOp.getOuterDimsPerm()); + if (!innerPermutation.empty()) { + assert(innerPermutation.size() == metadata.innerDimsPos.size() && + isPermutationVector(innerPermutation) && + "invalid inner permutation"); + applyPermutationToVector(metadata.innerDimsPos, innerPermutation); + applyPermutationToVector(metadata.innerTiles, innerPermutation); + } + if (!outerPermutation.empty()) { + assert(outerPermutation.size() == metadata.outerDimsPerm.size() && + isPermutationVector(outerPermutation) && + "invalid outer permutation"); + applyPermutationToVector(metadata.outerDimsPerm, outerPermutation); + } + return metadata; +} + +//===----------------------------------------------------------------------===// +// PackOp +//===----------------------------------------------------------------------===// + +void PackOp::getAsmResultNames(function_ref setNameFn) { + setNameFn(getResult(), "pack"); +} + +void PackOp::build(OpBuilder &builder, OperationState &state, Value source, + Value dest, ArrayRef innerDimsPos, + ArrayRef innerTiles, + std::optional paddingValue, + ArrayRef outerDimsPerm) { + assert(innerDimsPos.size() == innerTiles.size() && + "number of tile sizes specified must match the specified number of " + "original dimensions to be tiled"); + SmallVector staticTileSizes; + SmallVector dynamicTileSizes; + dispatchIndexOpFoldResults(innerTiles, dynamicTileSizes, staticTileSizes); + build(builder, state, dest.getType(), source, dest, + paddingValue ? *paddingValue : nullptr, + outerDimsPerm.empty() ? 
nullptr + : builder.getDenseI64ArrayAttr(outerDimsPerm), + builder.getDenseI64ArrayAttr(innerDimsPos), dynamicTileSizes, + builder.getDenseI64ArrayAttr(staticTileSizes)); +} + +LogicalResult +PackOp::reifyResultShapes(OpBuilder &builder, + ReifiedRankedShapedTypeDims &reifiedReturnShapes) { + return reifyResultShapesImpl(*this, builder, reifiedReturnShapes); +} + +DenseMap PackOp::getDimAndTileMapping() { + return getDimAndTileMappingImpl(*this); +} + +SmallVector PackOp::getMixedTiles() { + return getMixedTilesImpl(*this); +} + +SmallVector PackOp::getStaticTiles() { + return getStaticTilesImpl(*this); +} + +ArrayRef PackOp::getAllOuterDims() { + ShapedType inputType = getSourceType(); + int64_t inputRank = inputType.getRank(); + return getDestType().getShape().take_front(inputRank); +} + +SmallVector PackOp::getTiledOuterDims() { + auto innerDimsPos = getInnerDimsPos(); + auto packedShape = getDestType().getShape(); + SmallVector res; + + for (auto index : innerDimsPos) + res.push_back(packedShape[index]); + + return res; +} + +bool PackOp::requirePaddingValue(ArrayRef inputShape, + ArrayRef innerDimsPos, + ArrayRef outputShape, + ArrayRef outerDimsPerm, + ArrayRef innerTiles) { + SmallVector outputTileSizes( + outputShape.take_front(inputShape.size())); + if (!outerDimsPerm.empty()) { + assert(outerDimsPerm.size() == outputTileSizes.size() && + "expected output and outer_dims_perm to have same size"); + applyPermutationToVector(outputTileSizes, + invertPermutationVector(outerDimsPerm)); + } + for (auto [pos, tileSize] : llvm::zip_equal(innerDimsPos, innerTiles)) { + if (ShapedType::isDynamic(inputShape[pos])) + continue; + std::optional constantTile = getConstantIntValue(tileSize); + + if (!constantTile) { + if (!ShapedType::isDynamic(outputTileSizes[pos]) && + (inputShape[pos] % outputTileSizes[pos] != 0)) + return true; + } else if (inputShape[pos] % (*constantTile) != 0) { + return true; + } + } + return false; +} + +LogicalResult PackOp::verify() { + if (failed(commonVerifierPackAndUnPackOp(*this))) + return failure(); + + // Verify padding value, and bail out if the tile does not divide the + // dimension fully. In the case of dynamic tile factors or dimensions, having + // a partial tile is undefined behavior. + auto paddingValue = getPaddingValue(); + if (paddingValue && + paddingValue.getType() != getSourceType().getElementType()) { + return emitOpError("expected padding_value has ") + << getSourceType().getElementType() + << " but got: " << paddingValue.getType(); + } + + if (!paddingValue && + requirePaddingValue(getSourceType().getShape(), getInnerDimsPos(), + getDestType().getShape(), getOuterDimsPerm(), + getMixedTiles())) { + return emitOpError( + "invalid tile factor or output size provided. Only full tiles are " + "supported when padding_value is not set"); + } + return success(); +} + +/// Converts OpFoldResults to int64_t shape entries, unconditionally mapping all +/// Value's to kDynamic, even if they are arith.constant values. +static SmallVector +asShapeWithAnyValueAsDynamic(ArrayRef ofrs) { + SmallVector result; + for (auto o : ofrs) { + // Have to do this first, as getConstantIntValue special-cases constants. + if (llvm::dyn_cast_if_present(o)) + result.push_back(ShapedType::kDynamic); + else + result.push_back(getConstantIntValue(o).value_or(ShapedType::kDynamic)); + } + return result; +} + +/// Helper for PackOp::{getResultShape,inferPackedType}. Returns the shape of +/// the packed type. 
Having a shared helper helps implement these two methods in +/// a way that ensures that they agree on which dimensions are dynamic. +static SmallVector getPackOpResultTypeShape( + ArrayRef sourceShape, ArrayRef innerTileSizes, + ArrayRef innerDimsPos, ArrayRef outerDimsPerm) { + SmallVector resultShape = llvm::to_vector(sourceShape); + for (auto tiledDim : llvm::enumerate(llvm::to_vector(innerDimsPos))) { + if (ShapedType::isDynamic(resultShape[tiledDim.value()])) + continue; + if (ShapedType::isDynamic(innerTileSizes[tiledDim.index()])) { + resultShape[tiledDim.value()] = ShapedType::kDynamic; + continue; + } + resultShape[tiledDim.value()] = llvm::divideCeilSigned( + resultShape[tiledDim.value()], innerTileSizes[tiledDim.index()]); + } + + // Swap tile loops if outer_dims_perm is available. + if (!outerDimsPerm.empty()) + applyPermutationToVector(resultShape, outerDimsPerm); + + // Append the inner tile dimensions. + resultShape.append(innerTileSizes.begin(), innerTileSizes.end()); + return resultShape; +} + +SmallVector PackOp::getResultShape( + OpBuilder &builder, Location loc, ArrayRef sourceDims, + ArrayRef innerTileSizes, ArrayRef innerDimsPos, + ArrayRef outerDimsPerm) { + SmallVector resultDims = llvm::to_vector(sourceDims); + + AffineExpr s0, s1; + bindSymbols(builder.getContext(), s0, s1); + AffineExpr ceilDivExpr = s0.ceilDiv(s1); + for (auto tiledDim : llvm::enumerate(llvm::to_vector(innerDimsPos))) { + resultDims[tiledDim.value()] = affine::makeComposedFoldedAffineApply( + builder, loc, ceilDivExpr, + {resultDims[tiledDim.value()], innerTileSizes[tiledDim.index()]}); + } + if (!outerDimsPerm.empty()) + applyPermutationToVector(resultDims, outerDimsPerm); + resultDims.append(innerTileSizes.begin(), innerTileSizes.end()); + + SmallVector resultTypeShape = + getPackOpResultTypeShape(asShapeWithAnyValueAsDynamic(sourceDims), + asShapeWithAnyValueAsDynamic(innerTileSizes), + innerDimsPos, outerDimsPerm); + + // Fix-up `resultDims` to ensure that they are Value's if and only if the + // result type shape says it's a dynamic dim. This is needed as callers may + // use dispatchIndexOpFoldResults on the result, and rely on exact number of + // dynamic dims returned by that. + for (unsigned i = 0; i < resultDims.size(); ++i) { + if (!ShapedType::isDynamic(resultTypeShape[i])) + continue; + resultDims[i] = + getValueOrCreateConstantIndexOp(builder, loc, resultDims[i]); + } + + return resultDims; +} + +/// Get the expected packed type based on source type, tile factors, position of +/// the inner tiles and permutation of the outer tiled loop. 
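The shape arithmetic in getPackOpResultTypeShape above, which inferPackedType below exposes as a RankedTensorType, follows three steps: ceil-divide every tiled dimension by its tile size, apply outer_dims_perm to the outer dimensions, then append the tile sizes. A self-contained sketch of the same computation on plain integer vectors, with -1 modelling a dynamic extent (illustrative only, not the MLIR code):

```cpp
#include <cstdint>
#include <vector>

constexpr int64_t kDyn = -1; // stand-in for ShapedType::kDynamic

std::vector<int64_t> inferPackedShape(std::vector<int64_t> shape,
                                      const std::vector<int64_t> &innerTiles,
                                      const std::vector<int64_t> &innerDimsPos,
                                      const std::vector<int64_t> &outerPerm) {
  // 1. Each tiled dimension becomes ceil(size / tile); it stays dynamic if
  //    either the dimension or its tile size is dynamic.
  for (size_t i = 0; i < innerDimsPos.size(); ++i) {
    int64_t pos = innerDimsPos[i];
    if (shape[pos] == kDyn)
      continue;
    shape[pos] = innerTiles[i] == kDyn
                     ? kDyn
                     : (shape[pos] + innerTiles[i] - 1) / innerTiles[i];
  }
  // 2. Apply the optional outer_dims_perm to the outer dimensions
  //    (result[i] = shape[perm[i]]).
  if (!outerPerm.empty()) {
    std::vector<int64_t> permuted(shape.size());
    for (size_t i = 0; i < outerPerm.size(); ++i)
      permuted[i] = shape[outerPerm[i]];
    shape = permuted;
  }
  // 3. Append the inner tile sizes as the trailing dimensions.
  shape.insert(shape.end(), innerTiles.begin(), innerTiles.end());
  return shape;
}

int main() {
  // tensor<128x256> with inner_dims_pos = [0, 1], inner_tiles = [8, 32] and
  // outer_dims_perm = [1, 0] packs to 8x16x8x32 (the "CK to KCck" example).
  auto s = inferPackedShape({128, 256}, {8, 32}, {0, 1}, {1, 0});
  return (s == std::vector<int64_t>{8, 16, 8, 32}) ? 0 : 1;
}
```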
+RankedTensorType PackOp::inferPackedType(RankedTensorType sourceType, + ArrayRef innerTileSizes, + ArrayRef innerDimsPos, + ArrayRef outerDimsPerm) { + SmallVector resultShape = getPackOpResultTypeShape( + sourceType.getShape(), innerTileSizes, innerDimsPos, outerDimsPerm); + return RankedTensorType::get(resultShape, sourceType.getElementType()); +} + +Value PackOp::createDestinationTensor(OpBuilder &b, Location loc, Value source, + ArrayRef innerTileSizes, + ArrayRef innerDimsPos, + ArrayRef outerDimsPerm) { + AffineExpr dim0, dim1; + bindDims(b.getContext(), dim0, dim1); + auto ceilDiv = [&](OpFoldResult v1, OpFoldResult v2) -> OpFoldResult { + return affine::makeComposedFoldedAffineApply(b, loc, dim0.ceilDiv(dim1), + {v1, v2}); + }; + + SmallVector mixedSizes; + for (auto [index, value] : llvm::enumerate( + llvm::cast(source.getType()).getShape())) { + if (ShapedType::isDynamic(value)) + mixedSizes.push_back( + b.create(loc, source, index).getResult()); + else + mixedSizes.push_back(b.getIndexAttr(value)); + } + for (auto it : llvm::zip(innerDimsPos, innerTileSizes)) { + int64_t dimPos = std::get<0>(it); + OpFoldResult tileSize = std::get<1>(it); + mixedSizes[dimPos] = ceilDiv(mixedSizes[dimPos], tileSize); + } + if (!outerDimsPerm.empty()) + applyPermutationToVector(mixedSizes, outerDimsPerm); + + mixedSizes.append(innerTileSizes.begin(), innerTileSizes.end()); + auto elemType = llvm::cast(source.getType()).getElementType(); + return b.create(loc, mixedSizes, elemType); +} + +PackOp PackOp::createTransposedClone(OpBuilder &b, Location loc, + ArrayRef innerPermutation, + ArrayRef outerPermutation) { + PackOrUnPackTransposeResult metadata = commonPermutationOfPackAndUnPackOp( + *this, innerPermutation, outerPermutation); + Value transposedDest = + createDestinationTensor(b, loc, getSource(), metadata.innerTiles, + metadata.innerDimsPos, metadata.outerDimsPerm); + return b.create(loc, getSource(), transposedDest, + metadata.innerDimsPos, metadata.innerTiles, + getPaddingValue(), metadata.outerDimsPerm); +} + +/// Returns true if the tiles and the tiled dims are constant. +template +bool areTilesAndTiledDimsAllConstant(OpTy op) { + static_assert(llvm::is_one_of::value, + "applies to only pack or unpack operations"); + ShapedType packedType = (std::is_same::value) + ? op.getDestType() + : op.getSourceType(); + SmallVector mixedTiles = op.getMixedTiles(); + for (auto [dimDest, tile] : llvm::zip( + packedType.getShape().take_back(mixedTiles.size()), mixedTiles)) { + std::optional constTileSize = getConstantIntValue(tile); + if (!constTileSize || ShapedType::isDynamic(dimDest)) + return false; + } + return true; +} + +Speculation::Speculatability PackOp::getSpeculatability() { + if (getPaddingValue()) + return Speculation::Speculatable; + + // The verifier rejects already operations if we can statically prove that the + // sizes of the tiles do not divide perfectly the dimension; thus, check only + // to have constant tiles and tiled inner dimensions. + if (!areTilesAndTiledDimsAllConstant(*this)) + return Speculation::NotSpeculatable; + + return Speculation::Speculatable; +} + +// Return true if `inner_dims_pos` and `outer_dims_perm` target the same +// dimensions for pack and unpack. +static bool hasSameInnerOuterAttribute(PackOp packOp, UnPackOp unPackOp) { + if (packOp.getInnerDimsPos() != unPackOp.getInnerDimsPos()) + return false; + if (packOp.getOuterDimsPerm() == unPackOp.getOuterDimsPerm()) + return true; + // Outer dims permutation is optional. 
+ // To compare unbalanced pack-unpack pair, treat no permutation as equal to + // identity permutation. + return isIdentityPermutation(packOp.getOuterDimsPerm()) && + isIdentityPermutation(unPackOp.getOuterDimsPerm()); +} + +// Return true if pack and unpack have the same tiles. +// Same SSA values or same integer constants. +static bool haveSameTiles(PackOp packOp, UnPackOp unPackOp) { + auto packTiles = packOp.getMixedTiles(); + auto unPackTiles = unPackOp.getMixedTiles(); + if (packTiles.size() != unPackTiles.size()) + return false; + for (size_t i = 0, e = packTiles.size(); i < e; i++) { + if (!isEqualConstantIntOrValue(packTiles[i], unPackTiles[i])) + return false; + } + return true; +} + +/// Returns true if the pack op does not need a padding value. +static bool paddingIsNotNeeded(PackOp op) { + auto srcType = op.getSourceType(); + if (llvm::any_of(op.getInnerDimsPos(), + [&](int64_t pos) { return srcType.isDynamicDim(pos); })) + return false; + if (ShapedType::isDynamicShape(op.getStaticInnerTiles())) + return false; + return !PackOp::requirePaddingValue( + srcType.getShape(), op.getInnerDimsPos(), op.getDestType().getShape(), + op.getOuterDimsPerm(), op.getMixedTiles()); +} + +/// Returns true if the `srcShape` or `destShape` is different from the one in +/// `packOp` and populates each with the inferred static shape. +static bool inferStaticShape(PackOp packOp, SmallVectorImpl &srcShape, + SmallVectorImpl &destShape) { + bool changeNeeded = false; + srcShape.assign(packOp.getSourceType().getShape().begin(), + packOp.getSourceType().getShape().end()); + destShape.assign(packOp.getDestType().getShape().begin(), + packOp.getDestType().getShape().end()); + llvm::SmallSetVector innerDims; + innerDims.insert(packOp.getInnerDimsPos().begin(), + packOp.getInnerDimsPos().end()); + SmallVector inverseOuterDimsPerm; + if (!packOp.getOuterDimsPerm().empty()) + inverseOuterDimsPerm = invertPermutationVector(packOp.getOuterDimsPerm()); + int srcRank = packOp.getSourceRank(); + for (auto i : llvm::seq(0, srcRank)) { + if (innerDims.contains(i)) + continue; + int64_t srcPos = i; + int64_t destPos = i; + if (!inverseOuterDimsPerm.empty()) + destPos = inverseOuterDimsPerm[srcPos]; + if (ShapedType::isDynamic(srcShape[srcPos]) == + ShapedType::isDynamic(destShape[destPos])) { + continue; + } + int64_t size = srcShape[srcPos]; + if (ShapedType::isDynamic(size)) + size = destShape[destPos]; + srcShape[srcPos] = size; + destShape[destPos] = size; + changeNeeded = true; + } + return changeNeeded; +} + +LogicalResult PackOp::canonicalize(PackOp packOp, PatternRewriter &rewriter) { + // Fold an pack(unpack(x)) to x. + if (auto unPackOp = packOp.getSource().getDefiningOp()) { + if (unPackOp.getSourceType() != packOp.getDestType()) + return failure(); + if (packOp.getPaddingValue() || + !hasSameInnerOuterAttribute(packOp, unPackOp) || + !haveSameTiles(packOp, unPackOp)) + return failure(); + rewriter.replaceOp(packOp, unPackOp.getSource()); + return success(); + } + + // Fold optional PaddingValue operand away if padding is not needed. + if (packOp.getPaddingValue() && paddingIsNotNeeded(packOp)) { + rewriter.startOpModification(packOp); + packOp.getPaddingValueMutable().clear(); + rewriter.finalizeOpModification(packOp); + return success(); + } + + // Insert tensor.cast ops if static shape inference is available.. 
+ SmallVector srcShape, destShape; + if (inferStaticShape(packOp, srcShape, destShape)) { + Location loc = packOp.getLoc(); + Value source = packOp.getSource(); + if (srcShape != packOp.getSourceType().getShape()) { + auto newSrcType = packOp.getSourceType().clone(srcShape); + source = + rewriter.create(loc, newSrcType, packOp.getSource()); + } + Value dest = packOp.getDest(); + RankedTensorType originalResultType = packOp.getDestType(); + bool needUpdateDestType = (destShape != originalResultType.getShape()); + if (needUpdateDestType) { + auto newDestType = packOp.getDestType().clone(destShape); + dest = + rewriter.create(loc, newDestType, packOp.getDest()); + } + rewriter.modifyOpInPlace(packOp, [&] { + packOp.getSourceMutable().assign(source); + packOp.getDestMutable().assign(dest); + packOp.getResult().setType(cast(dest.getType())); + }); + // Insert a cast if needed + if (needUpdateDestType) { + rewriter.setInsertionPointAfter(packOp); + auto castOp = + rewriter.create(loc, originalResultType, packOp); + rewriter.replaceAllUsesExcept(packOp, castOp, castOp); + } + return success(); + } + + return failure(); +} + +template +static bool isLikePadUnPad(PackOrUnpackOp packOp, + RankedTensorType packedTensorType) { + static_assert(std::is_same::value || + std::is_same::value, + "Function meant for pack/unpack"); + // This is a pad if packing only adds ones and we don't transpose dimensions. + + // Check that we are not transposing any dimensions. + ArrayRef innerDimsPos = packOp.getInnerDimsPos(); + int64_t numPackedDims = innerDimsPos.size(); + auto orderedDims = llvm::to_vector<4>(llvm::seq(0, numPackedDims)); + if (orderedDims != innerDimsPos) { + // Dimensions don't happen in order. + return false; + } + + ArrayRef packedShape = packedTensorType.getShape(); + int64_t packedRank = packedTensorType.getRank(); + // At this point we know that we are taking numPackedDims outer + // dimensions and pushing them all the way as the inner most dimensions. + // What's left on the outer most dimensions is, in this order: + // - the factor of the packed dimensions, then + // - the untouched dimensions + // This shifting inward of dimensions is a no-op (as opposed to a transpose) + // if all the dimensions that bubble outerward are ones. + // Therefore check that all the dimensions but the numPackedDims inner most + // ones are ones. + return llvm::all_of( + llvm::seq(0, packedRank - numPackedDims), + [&packedShape](int64_t i) { return packedShape[i] == 1; }); +} + +bool PackOp::isLikePad() { + auto packedTensorType = + llvm::cast((*this)->getResultTypes().front()); + return isLikePadUnPad(*this, packedTensorType); +} + +OpFoldResult PackOp::fold(FoldAdaptor adaptor) { + std::optional paddingValue; + if (auto pad = adaptor.getPaddingValue()) + paddingValue = pad; + if (OpFoldResult reshapedSource = reshapeConstantSource( + llvm::dyn_cast_if_present(adaptor.getSource()), + getDestType(), paddingValue)) + return reshapedSource; + return {}; +} + +/// Folds a tensor.cast op into a consuming PackOp op if the +/// `tensor.cast` has source that is more static than the consuming op. +/// +/// Example: +/// ```mlir +/// %1 = tensor.cast %0 : tensor<8x16xf32> to tensor +/// %2 = tensor.pack %1 ... : tensor ... +/// ``` +/// +/// folds into: +/// +/// ```mlir +/// %2 = tensor.pack %0 ... : tensor<8x16xf32> ... 
+/// ``` +struct FoldTensorCastPackOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(PackOp op, + PatternRewriter &rewriter) const override { + if (!tensor::hasFoldableTensorCastOperand(op)) + return failure(); + + SmallVector newResultTypes(op->getResultTypes()); + SmallVector newOperands = + tensor::getUpdatedOperandsAfterCastOpFolding(op, newResultTypes); + + // Get the updated mixed-tile-sizes attribute. + SmallVector newMixedTileSizes = + getNewMixedTileSizes(rewriter, newResultTypes[0], op.getMixedTiles()); + + // Clone op. + // TODO: Strictly speaking, discardable attributes should be _discarded_ at + // this point. However, in practice, we use them for things that we'd like + // to preserve. Implement a better abstraction. + PackOp newOp = rewriter.create( + op.getLoc(), newOperands[0], newOperands[1], op.getInnerDimsPos(), + newMixedTileSizes, op.getPaddingValue(), op.getOuterDimsPerm()); + newOp->setDiscardableAttrs(op->getDiscardableAttrDictionary()); + + // Replace op. + Value oldResult = op.getResult(); + Value newResult = newOp.getResult(); + Value replacement = (newResult.getType() != oldResult.getType()) + ? rewriter.create( + op->getLoc(), oldResult.getType(), newResult) + : newResult; + + rewriter.replaceOp(op, {replacement}); + + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// UnPackOp +//===----------------------------------------------------------------------===// + +void UnPackOp::getAsmResultNames( + function_ref setNameFn) { + setNameFn(getResult(), "unpack"); +} + +LogicalResult +UnPackOp::reifyResultShapes(OpBuilder &builder, + ReifiedRankedShapedTypeDims &reifiedReturnShapes) { + return reifyResultShapesImpl(*this, builder, reifiedReturnShapes); +} + +DenseMap UnPackOp::getDimAndTileMapping() { + return getDimAndTileMappingImpl(*this); +} + +SmallVector UnPackOp::getMixedTiles() { + return getMixedTilesImpl(*this); +} + +SmallVector UnPackOp::getStaticTiles() { + return getStaticTilesImpl(*this); +} + +ArrayRef UnPackOp::getAllOuterDims() { + ShapedType destType = getDestType(); + int64_t destRank = destType.getRank(); + return getSourceType().getShape().take_front(destRank); +} + +SmallVector UnPackOp::getTiledOuterDims() { + auto innerDimsPos = getInnerDimsPos(); + auto packedShape = getSourceType().getShape(); + SmallVector res; + + for (auto index : innerDimsPos) + res.push_back(packedShape[index]); + + return res; +} + +LogicalResult UnPackOp::verify() { + return commonVerifierPackAndUnPackOp(*this); +} + +Speculation::Speculatability UnPackOp::getSpeculatability() { + // See PackOp::getSpeculatability. + if (!areTilesAndTiledDimsAllConstant(*this)) + return Speculation::NotSpeculatable; + + return Speculation::Speculatable; +} + +void UnPackOp::build(OpBuilder &builder, OperationState &state, Value source, + Value dest, ArrayRef innerDimsPos, + ArrayRef innerTiles, + ArrayRef outerDimsPerm) { + assert(innerDimsPos.size() == innerTiles.size() && + "number of tile sizes specified must match the specified number of " + "original dimensions to be tiled"); + SmallVector staticTileSizes; + SmallVector dynamicTileSizes; + dispatchIndexOpFoldResults(innerTiles, dynamicTileSizes, staticTileSizes); + build(builder, state, dest.getType(), source, dest, + outerDimsPerm.empty() ? 
nullptr + : builder.getDenseI64ArrayAttr(outerDimsPerm), + builder.getDenseI64ArrayAttr(innerDimsPos), dynamicTileSizes, + builder.getDenseI64ArrayAttr(staticTileSizes)); +} + +Value UnPackOp::createDestinationTensor(OpBuilder &b, Location loc, + Value source, + ArrayRef innerTileSizes, + ArrayRef innerDimsPos, + ArrayRef outerDimsPerm) { + AffineExpr sym0, sym1; + bindSymbols(b.getContext(), sym0, sym1); + auto dimMul = [&](OpFoldResult v1, OpFoldResult v2) -> OpFoldResult { + return affine::makeComposedFoldedAffineApply(b, loc, sym0 * sym1, {v1, v2}); + }; + + SmallVector mixedSizes; + auto srcType = llvm::cast(source.getType()); + for (auto i : + llvm::seq(0, srcType.getRank() - innerTileSizes.size())) { + if (srcType.isDynamicDim(i)) + mixedSizes.push_back(b.create(loc, source, i).getResult()); + else + mixedSizes.push_back(b.getIndexAttr(srcType.getDimSize(i))); + } + if (!outerDimsPerm.empty()) { + applyPermutationToVector( + mixedSizes, invertPermutationVector(outerDimsPerm)); + } + + for (auto [dimPos, tileSize] : llvm::zip_equal(innerDimsPos, innerTileSizes)) + mixedSizes[dimPos] = dimMul(mixedSizes[dimPos], tileSize); + + auto elemType = srcType.getElementType(); + return b.create(loc, mixedSizes, elemType); +} + +UnPackOp UnPackOp::createTransposedClone(OpBuilder &b, Location loc, + Value transposedSource, + ArrayRef innerPermutation, + ArrayRef outerPermutation) { + PackOrUnPackTransposeResult metadata = commonPermutationOfPackAndUnPackOp( + *this, innerPermutation, outerPermutation); + return b.create(loc, transposedSource, getDest(), + metadata.innerDimsPos, metadata.innerTiles, + metadata.outerDimsPerm); +} + +/// Returns true if the `srcShape` or `destShape` is different from the one in +/// `op` and populates each with the inferred static shape. 
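The static-shape inference used by both canonicalizers reduces to propagating a known static extent between source and destination for every outer dimension that is not tiled. A simplified standalone sketch of that propagation, ignoring outer_dims_perm and modelling dynamic sizes as -1 (the real code additionally remaps positions through the inverse permutation):

```cpp
#include <cstdint>
#include <set>
#include <vector>

constexpr int64_t kDyn = -1; // stand-in for ShapedType::kDynamic

// Propagate static sizes between the leading (outer) dims of two shapes for
// every dimension that is not a tiled dimension. Returns true if a shape
// changed, i.e. a tensor.cast to a more static type would be profitable.
bool propagateStaticSizes(std::vector<int64_t> &srcShape,
                          std::vector<int64_t> &destShape,
                          const std::set<int64_t> &innerDims) {
  bool changed = false;
  for (size_t i = 0; i < destShape.size(); ++i) {
    if (innerDims.count(static_cast<int64_t>(i)))
      continue; // tiled dims are governed by the tile sizes, not handled here
    bool srcDyn = srcShape[i] == kDyn, destDyn = destShape[i] == kDyn;
    if (srcDyn == destDyn)
      continue; // both static or both dynamic: nothing to learn
    int64_t size = srcDyn ? destShape[i] : srcShape[i];
    srcShape[i] = destShape[i] = size;
    changed = true;
  }
  return changed;
}

int main() {
  // Outer dims of the packed source are 7x?, the unpacked dest is ?x32:
  // each side learns the extent the other side already knows.
  std::vector<int64_t> src = {7, kDyn};
  std::vector<int64_t> dest = {kDyn, 32};
  bool changed = propagateStaticSizes(src, dest, /*innerDims=*/{});
  bool ok = changed && src == std::vector<int64_t>{7, 32} &&
            dest == std::vector<int64_t>{7, 32};
  return ok ? 0 : 1;
}
```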
+static bool inferStaticShape(UnPackOp op, SmallVectorImpl &srcShape, + SmallVectorImpl &destShape) { + bool changeNeeded = false; + srcShape.assign(op.getSourceType().getShape().begin(), + op.getSourceType().getShape().end()); + destShape.assign(op.getDestType().getShape().begin(), + op.getDestType().getShape().end()); + llvm::SmallSetVector innerDims; + innerDims.insert(op.getInnerDimsPos().begin(), op.getInnerDimsPos().end()); + SmallVector inverseOuterDimsPerm; + if (!op.getOuterDimsPerm().empty()) + inverseOuterDimsPerm = invertPermutationVector(op.getOuterDimsPerm()); + int destRank = op.getDestRank(); + for (auto i : llvm::seq(0, destRank)) { + if (innerDims.contains(i)) + continue; + int64_t srcPos = i; + int64_t destPos = i; + if (!inverseOuterDimsPerm.empty()) + srcPos = inverseOuterDimsPerm[destPos]; + if (ShapedType::isDynamic(srcShape[srcPos]) == + ShapedType::isDynamic(destShape[destPos])) { + continue; + } + int64_t size = srcShape[srcPos]; + if (ShapedType::isDynamic(size)) + size = destShape[destPos]; + srcShape[srcPos] = size; + destShape[destPos] = size; + changeNeeded = true; + } + return changeNeeded; +} + +LogicalResult UnPackOp::canonicalize(UnPackOp unPackOp, + PatternRewriter &rewriter) { + /// unpack(pack(x)) -> x + if (PackOp packOp = unPackOp.getSource().getDefiningOp()) { + if (packOp.getSourceType() != unPackOp.getDestType()) + return failure(); + if (packOp.getPaddingValue() || + !hasSameInnerOuterAttribute(packOp, unPackOp) || + !haveSameTiles(packOp, unPackOp)) + return failure(); + rewriter.replaceOp(unPackOp, packOp.getSource()); + return success(); + } + /// unpack(destinationStyleOp(x)) -> unpack(x) + if (auto dstStyleOp = + unPackOp.getDest().getDefiningOp()) { + auto destValue = cast(unPackOp.getDest()); + Value newDest = dstStyleOp.getDpsInits()[destValue.getResultNumber()]; + rewriter.modifyOpInPlace(unPackOp, + [&]() { unPackOp.setDpsInitOperand(0, newDest); }); + return success(); + } + + // Insert tensor.cast ops if static shape inference is available.. + SmallVector srcShape, destShape; + if (inferStaticShape(unPackOp, srcShape, destShape)) { + Location loc = unPackOp.getLoc(); + Value source = unPackOp.getSource(); + if (srcShape != unPackOp.getSourceType().getShape()) { + auto newSrcType = unPackOp.getSourceType().clone(srcShape); + source = rewriter.create(loc, newSrcType, + unPackOp.getSource()); + } + Value dest = unPackOp.getDest(); + if (destShape != unPackOp.getDestType().getShape()) { + auto newDestType = unPackOp.getDestType().clone(destShape); + dest = + rewriter.create(loc, newDestType, unPackOp.getDest()); + } + Value newOp = rewriter.create( + loc, source, dest, unPackOp.getInnerDimsPos(), unPackOp.getMixedTiles(), + unPackOp.getOuterDimsPerm()); + rewriter.replaceOpWithNewOp( + unPackOp, unPackOp.getResult().getType(), newOp); + return success(); + } + + return failure(); +} + +bool UnPackOp::isLikeUnPad() { + RankedTensorType packedTensorType = getSourceType(); + return isLikePadUnPad(*this, packedTensorType); +} + +OpFoldResult UnPackOp::fold(FoldAdaptor adaptor) { + if (OpFoldResult reshapedSource = reshapeConstantSource( + llvm::dyn_cast_if_present(adaptor.getSource()), + getResult().getType())) + return reshapedSource; + return {}; +} + +/// Folds a tensor.cast op into a consuming UnPackOp op if the +/// `tensor.cast` has source that is more static than the consuming op. +/// +/// Example: +/// ```mlir +/// %1 = tensor.cast %0 : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32> +/// %2 = tensor.unpack %1 ... 
: tensor<1x1x?x1xi32> -> tensor<7x?xi32> +/// ``` +/// +/// folds into: +/// +/// ```mlir +/// %2 = tensor.unpack %0 ... tensor<1x1x8x1xi32> -> tensor<7x?xi32> +/// ``` +struct FoldTensorCastUnPackOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(UnPackOp op, + PatternRewriter &rewriter) const override { + if (!tensor::hasFoldableTensorCastOperand(op)) + return failure(); + + SmallVector newResultTypes(op->getResultTypes()); + SmallVector newOperands = + tensor::getUpdatedOperandsAfterCastOpFolding(op, newResultTypes); + Value sourceTensor = newOperands[0]; + + // Get the updated mixed-tile-sizes attribute. + SmallVector newMixedTileSizes = getNewMixedTileSizes( + rewriter, sourceTensor.getType(), op.getMixedTiles()); + + // Clone op. + // TODO: Strictly speaking, discardable attributes should be _discarded_ at + // this point. However, in practice, we use them for things that we'd like + // to preserve. Implement a better abstraction. + UnPackOp newOp = rewriter.create( + op.getLoc(), sourceTensor, newOperands[1], op.getInnerDimsPos(), + newMixedTileSizes, op.getOuterDimsPerm()); + newOp->setDiscardableAttrs(op->getDiscardableAttrDictionary()); + + // Replace op. + Value oldResult = op.getResult(); + Value newResult = newOp.getResult(); + Value replacement = (newResult.getType() != oldResult.getType()) + ? rewriter.create( + op->getLoc(), oldResult.getType(), newResult) + : newResult; + + rewriter.replaceOp(op, {replacement}); + + return success(); + } +}; + } // namespace linalg } // namespace mlir + +//===----------------------------------------------------------------------===// +// LinalgDialect +//===----------------------------------------------------------------------===// + +void LinalgDialect::getCanonicalizationPatterns( + RewritePatternSet &results) const { + results.add(getContext()); +} + +Operation *LinalgDialect::materializeConstant(OpBuilder &builder, + Attribute value, Type type, + Location loc) { + return arith::ConstantOp::materialize(builder, value, type, loc); +} diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 51d1df52598c7..2f54e780093a2 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -267,6 +267,16 @@ void transform::ApplyPadVectorizationPatternsOp::populatePatterns( linalg::populatePadOpVectorizationPatterns(patterns); } +void transform::ApplyFoldIntoPackAndUnpackPatternsOp::populatePatterns( + RewritePatternSet &patterns) { + linalg::populateFoldIntoPackAndUnpackPatterns(patterns); +} + +void transform::ApplyFoldPackUnpackIntoEmptyPatternsOp::populatePatterns( + RewritePatternSet &patterns) { + linalg::populateFoldPackUnpackIntoTensorEmptyPatterns(patterns); +} + //===----------------------------------------------------------------------===// // BufferizeToAllocationOp //===----------------------------------------------------------------------===// @@ -1170,7 +1180,7 @@ LogicalResult transform::InterchangeOp::verify() { //===----------------------------------------------------------------------===// DiagnosedSilenceableFailure transform::LowerPackOp::applyToOne( - transform::TransformRewriter &rewriter, tensor::PackOp target, + transform::TransformRewriter &rewriter, linalg::PackOp target, transform::ApplyToEachResultList &transformResults, transform::TransformState &state) { rewriter.setInsertionPoint(target); @@ -1192,7 
+1202,7 @@ DiagnosedSilenceableFailure transform::LowerPackOp::applyToOne( //===----------------------------------------------------------------------===// DiagnosedSilenceableFailure transform::LowerUnPackOp::applyToOne( - transform::TransformRewriter &rewriter, tensor::UnPackOp target, + transform::TransformRewriter &rewriter, linalg::UnPackOp target, transform::ApplyToEachResultList &transformResults, transform::TransformState &state) { rewriter.setInsertionPoint(target); @@ -1622,7 +1632,7 @@ bool isValidPackingPermutation( RelayoutOpTy op, ArrayRef permutation, OuterOrInnerPerm outerOrInnerPerm = OuterOrInnerPerm::Outer) { static_assert( - llvm::is_one_of::value, + llvm::is_one_of::value, "applies to only pack or unpack operations"); if (!op || permutation.empty()) return true; @@ -1631,7 +1641,7 @@ bool isValidPackingPermutation( return permutation.size() == innerRank && isPermutationVector(permutation); // op.getOuterDimsPerm() may be empty, in which case it is identity. // Don't rely on it. - if (std::is_same::value) { + if (std::is_same::value) { return permutation.size() == op.getSourceRank() && isPermutationVector(permutation); } @@ -1665,11 +1675,11 @@ transform::PackTransposeOp::apply(transform::TransformRewriter &rewriter, } // Step 2.2. Fail on wrong type. - auto packOp = dyn_cast(*packOrUnpackOps.begin()); - auto unPackOp = dyn_cast(*packOrUnpackOps.begin()); + auto packOp = dyn_cast(*packOrUnpackOps.begin()); + auto unPackOp = dyn_cast(*packOrUnpackOps.begin()); if ((!packOp && !unPackOp)) { return emitSilenceableError() << "requires target to map to a " - "tensor.pack or tensor.unpack"; + "linalg.pack or linalg.unpack"; } LinalgOp linalgOpTarget = dyn_cast(*linalgOps.begin()); if (!linalgOpTarget) @@ -1694,7 +1704,7 @@ transform::PackTransposeOp::apply(transform::TransformRewriter &rewriter, assert(!packOp && "packOp must be null on entry when unPackOp is not null"); OpOperand *packUse = linalgOp.getDpsInitOperand( cast(unPackOp.getSource()).getResultNumber()); - packOp = dyn_cast_or_null(packUse->get().getDefiningOp()); + packOp = dyn_cast_or_null(packUse->get().getDefiningOp()); if (!packOp || !packOp.getResult().hasOneUse()) return emitSilenceableError() << "could not find matching pack op"; } diff --git a/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp b/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp index 7f9a0f7a6ca43..81842e4bea631 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp @@ -88,7 +88,7 @@ static bool validateFullTilesOnDims(linalg::LinalgOp linalgOp, /// Return failure or packed matmul with one of its operands transposed. 
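Both the pack-transpose transform above and the matmul block-packing below hinge on applying and inverting small dimension permutations. A minimal self-contained sketch of those two operations, mirroring the conventions of applyPermutationToVector and invertPermutationVector but not using the MLIR utilities themselves:

```cpp
#include <cstdint>
#include <vector>

// result[i] = input[perm[i]], i.e. perm lists which input position feeds each
// output position.
std::vector<int64_t> applyPerm(const std::vector<int64_t> &input,
                               const std::vector<int64_t> &perm) {
  std::vector<int64_t> result;
  result.reserve(perm.size());
  for (int64_t p : perm)
    result.push_back(input[p]);
  return result;
}

// inverse[perm[i]] = i, so applyPerm(applyPerm(v, perm), inverse) == v.
std::vector<int64_t> invertPerm(const std::vector<int64_t> &perm) {
  std::vector<int64_t> inverse(perm.size());
  for (size_t i = 0; i < perm.size(); ++i)
    inverse[perm[i]] = static_cast<int64_t>(i);
  return inverse;
}

int main() {
  // outer_dims_perm = [1, 0] swaps the two outer block dimensions of a packed
  // 16x8x8x32 tensor: [16, 8] -> [8, 16]; the inverse undoes it.
  std::vector<int64_t> outer = {16, 8};
  std::vector<int64_t> perm = {1, 0};
  auto swapped = applyPerm(outer, perm);
  auto back = applyPerm(swapped, invertPerm(perm));
  return (swapped == std::vector<int64_t>{8, 16} && back == outer) ? 0 : 1;
}
```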
static FailureOr transposePackedMatmul(RewriterBase &rewriter, linalg::LinalgOp linalgOp, - tensor::PackOp packOp, AffineMap operandMap, + linalg::PackOp packOp, AffineMap operandMap, ArrayRef blocksStartDimPos, bool transposeOuterBlocks, bool transposeInnerBlocks) { assert(operandMap.getNumDims() >= 4 && diff --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt index 3594b08413812..d18b6f8afc43b 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt @@ -26,6 +26,7 @@ add_mlir_dialect_library(MLIRLinalgTransforms MeshShardingInterfaceImpl.cpp NamedOpConversions.cpp BlockPackMatmul.cpp + PackAndUnpackPatterns.cpp Padding.cpp Promotion.cpp RuntimeOpVerification.cpp diff --git a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp index c906f3bdcc632..9f5000b70b6f6 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp @@ -61,7 +61,7 @@ template static FailureOr getPackingInfoFromOperand(OpOperand *opOperand, linalg::GenericOp genericOp, OpTy packOrUnPackOp) { - static_assert(llvm::is_one_of::value, + static_assert(llvm::is_one_of::value, "applies to only pack or unpack operations"); LLVM_DEBUG( { llvm::dbgs() << "--- Construct PackInfo From an operand ---\n"; }); @@ -210,7 +210,7 @@ static SmallVector computeOuterDims(ArrayRef perm, /// %4 = arith.addf %arg3, %arg4 : f32 /// linalg.yield %4 : f32 /// } -> tensor -/// %1 = tensor.pack %0 +/// %1 = linalg.pack %0 /// inner_dims_pos = [0, 1] /// inner_tiles = [8, 2] /// into %dest : tensor -> tensor @@ -219,7 +219,7 @@ static SmallVector computeOuterDims(ArrayRef perm, /// 8. Thus, the below operation and `affine_map<(d0, d1, d2, d3)> -> /// affine_map<(d1, d3)>` will be returned. /// -/// %pack = tensor.pack %arg0 +/// %pack = linalg.pack %arg0 /// inner_dims_pos = [0] /// inner_tiles = [8] /// into %init : tensor -> tensor @@ -290,9 +290,9 @@ getOrCreatePackedViewOfOperand(OpBuilder &b, Location loc, PackInfo packInfo, if (innerDimsPos.empty() && outerDimsPerm.empty()) return std::make_tuple(opOperand->get(), indexingMap); - auto empty = tensor::PackOp::createDestinationTensor( + auto empty = linalg::PackOp::createDestinationTensor( b, loc, opOperand->get(), innerTileSizes, innerDimsPos, outerDimsPerm); - auto packedOperand = b.create( + auto packedOperand = b.create( loc, opOperand->get(), empty, innerDimsPos, innerTileSizes, /*padding=*/std::nullopt, outerDimsPerm); return std::make_tuple(packedOperand, indexingMap); @@ -327,7 +327,7 @@ static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp, return newGenericOp; } -/// Bubbles up tensor.pack op through a producer generic op. This +/// Bubbles up linalg.pack op through a producer generic op. This /// swap pack(generic) to generic(pack). The new generic op works on packed /// domain; pack ops are created for input and output operands. 
E.g., /// @@ -343,7 +343,7 @@ static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp, /// %4 = arith.addf %arg3, %arg3 : f32 /// linalg.yield %4 : f32 /// } -> tensor -/// %4 = tensor.pack %3 +/// %4 = linalg.pack %3 /// inner_dims_pos = [0, 1] /// inner_tiles = [8, 2] /// into %dest : tensor -> tensor @@ -358,7 +358,7 @@ static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp, /// %0 = affine.apply #map()[%dim] /// %1 = affine.apply #map1()[%dim_0] /// %2 = tensor.empty(%0, %1) : tensor -/// %pack = tensor.pack %arg0 +/// %pack = linalg.pack %arg0 /// inner_dims_pos = [0, 1] /// inner_tiles = [8, 2] /// into %2 : tensor -> tensor @@ -371,7 +371,7 @@ static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp, /// linalg.yield %4 : f32 /// } -> tensor static FailureOr -bubbleUpPackOpThroughGenericOp(RewriterBase &rewriter, tensor::PackOp packOp, +bubbleUpPackOpThroughGenericOp(RewriterBase &rewriter, linalg::PackOp packOp, const ControlPropagationFn &controlFn) { auto genericOp = packOp.getSource().getDefiningOp(); if (!genericOp) @@ -416,11 +416,11 @@ bubbleUpPackOpThroughGenericOp(RewriterBase &rewriter, tensor::PackOp packOp, rewriter.setInsertionPoint(genericOp); // We need to handle two cases: - // 1) The tensor.pack destination is a tensor.empty. If this is the case, we + // 1) The linalg.pack destination is a tensor.empty. If this is the case, we // create a new tensor.empty to avoid breaking dominance, as we are moving the - // tensor.pack above the linalg.generic. + // linalg.pack above the linalg.generic. // 2) The destination is not a tensor.empty. In this case we can replace only - // if the destination of the tensor.pack dominates the linalg.generic. + // if the destination of the linalg.pack dominates the linalg.generic. Value packOpDest = packOp.getDest(); if (!packOpDest.hasOneUse()) return failure(); @@ -453,13 +453,13 @@ bubbleUpPackOpThroughGenericOp(RewriterBase &rewriter, tensor::PackOp packOp, /// Wrapper pattern that applies bubbleUpPackOpThroughGenericOp method. struct BubbleUpPackOpThroughGenericOpPattern - : public OpRewritePattern { + : public OpRewritePattern { public: BubbleUpPackOpThroughGenericOpPattern(MLIRContext *context, ControlPropagationFn fun) - : OpRewritePattern(context), controlFn(std::move(fun)) {} + : OpRewritePattern(context), controlFn(std::move(fun)) {} - LogicalResult matchAndRewrite(tensor::PackOp packOp, + LogicalResult matchAndRewrite(linalg::PackOp packOp, PatternRewriter &rewriter) const override { auto genericOp = bubbleUpPackOpThroughGenericOp(rewriter, packOp, controlFn); @@ -473,15 +473,15 @@ struct BubbleUpPackOpThroughGenericOpPattern ControlPropagationFn controlFn; }; -/// Propagate a tensor.pack operation up through a tensor.pad. The idea is to +/// Propagate a linalg.pack operation up through a tensor.pad. The idea is to /// add as many zero padding dimensions in `high` and `low` based on the number /// of point loops. 
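/// A hypothetical illustration of the rewrite (values chosen for clarity, not
/// taken from this patch): a linalg.pack whose source is a tensor.pad on dim 0
/// becomes a tensor.pad of the packed source, where the zero entries appended
/// to `low`/`high` correspond to the newly introduced inner tile dimensions,
/// so pack(pad(x)) turns into pad(pack(x)).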
-class BubbleUpPackThroughPadOp final : public OpRewritePattern { +class BubbleUpPackThroughPadOp final : public OpRewritePattern { public: BubbleUpPackThroughPadOp(MLIRContext *context, ControlPropagationFn fun) - : OpRewritePattern(context), controlFn(std::move(fun)) {} + : OpRewritePattern(context), controlFn(std::move(fun)) {} - LogicalResult matchAndRewrite(tensor::PackOp packOp, + LogicalResult matchAndRewrite(linalg::PackOp packOp, PatternRewriter &rewriter) const override { auto padOp = packOp.getSource().getDefiningOp(); if (!padOp) @@ -522,10 +522,10 @@ class BubbleUpPackThroughPadOp final : public OpRewritePattern { ArrayRef outerDimsPerm = packOp.getOuterDimsPerm(); SmallVector mixedTiles = packOp.getMixedTiles(); - auto empty = tensor::PackOp::createDestinationTensor( + auto empty = linalg::PackOp::createDestinationTensor( rewriter, loc, padOp.getSource(), mixedTiles, innerDimsPos, outerDimsPerm); - auto sourcePack = rewriter.create( + auto sourcePack = rewriter.create( loc, padOp.getSource(), empty, innerDimsPos, mixedTiles, /*padding=*/std::nullopt, outerDimsPerm); @@ -549,9 +549,9 @@ class BubbleUpPackThroughPadOp final : public OpRewritePattern { // If the pad has more than one user, create an unpack on the new pad to // replace the other uses. if (!padOp->hasOneUse()) { - auto unpackEmpty = tensor::UnPackOp::createDestinationTensor( + auto unpackEmpty = linalg::UnPackOp::createDestinationTensor( rewriter, loc, newPadOp, mixedTiles, innerDimsPos, outerDimsPerm); - Value unpackedPad = rewriter.create( + Value unpackedPad = rewriter.create( loc, newPadOp, unpackEmpty, innerDimsPos, mixedTiles, outerDimsPerm); rewriter.replaceAllUsesExcept(padOp, unpackedPad, sourcePack); } @@ -636,20 +636,20 @@ static int64_t applyPermutationAndReindexReassoc( /// /// %collapsed = tensor.collapse_shape %in [[0, 1], 2] /// : tensor into tensor -/// %pack = tensor.pack %collapsed outer_dims_perm = [0, 1] +/// %pack = linalg.pack %collapsed outer_dims_perm = [0, 1] /// inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %empty /// : tensor -> tensor /// /// can be transformed into: /// -/// %pack = tensor.pack %in outer_dims_perm = [1, 2] +/// %pack = linalg.pack %in outer_dims_perm = [1, 2] /// inner_dims_pos = [1, 2] inner_tiles = [8, 1] into %empty /// : tensor -> tensor /// %collapsed = tensor.collapse_shape %pack [[0, 1], 2, 3, 4] /// : tensor into tensor static LogicalResult bubbleUpPackOpThroughCollapseShape(tensor::CollapseShapeOp collapseOp, - tensor::PackOp packOp, + linalg::PackOp packOp, PatternRewriter &rewriter) { SmallVector innerTileSizes = packOp.getStaticTiles(); ArrayRef innerDimsPos = packOp.getInnerDimsPos(); @@ -682,10 +682,10 @@ bubbleUpPackOpThroughCollapseShape(tensor::CollapseShapeOp collapseOp, reassocIndices[outerPos].end()); } - auto emptyOp = tensor::PackOp::createDestinationTensor( + auto emptyOp = linalg::PackOp::createDestinationTensor( rewriter, packOp.getLoc(), collapseOp.getSrc(), packOp.getMixedTiles(), projectedInnerDimsPos, newOuterDimsPerm); - auto newPackOp = rewriter.create( + auto newPackOp = rewriter.create( packOp.getLoc(), collapseOp.getSrc(), emptyOp, projectedInnerDimsPos, packOp.getMixedTiles(), packOp.getPaddingValue(), newOuterDimsPerm); @@ -742,20 +742,20 @@ projectDimsPosIntoReassocPos(ArrayRef dimsPos, /// /// %expand = tensor.expand_shape %in [[0], [1, 2]] /// : tensor into tensor -/// %pack = tensor.pack %expand outer_dims_perm = [0, 1] +/// %pack = linalg.pack %expand outer_dims_perm = [0, 1] /// inner_dims_pos = [2] inner_tiles = [8] into 
%empty /// : tensor -> tensor /// /// can be transformed into: /// -/// %pack = tensor.pack %in outer_dims_perm = [1, 2] +/// %pack = linalg.pack %in outer_dims_perm = [1, 2] /// inner_dims_pos = [1] inner_tiles = [8] into %empty /// : tensor -> tensor /// %expand = tensor.expand_shape %pack [[0], [1, 2], [3]] /// : tensor into tensor static LogicalResult bubbleUpPackOpThroughExpandShape(tensor::ExpandShapeOp expandOp, - tensor::PackOp packOp, + linalg::PackOp packOp, PatternRewriter &rewriter) { // Outer dimensions permutation is not supported currently. // TODO: Handle outer_dims_perm variants. @@ -808,7 +808,7 @@ bubbleUpPackOpThroughExpandShape(tensor::ExpandShapeOp expandOp, // If reassociation is not possible, then reordering cannot happen. // This can be caused by pack padding affecting previously expanded // dimensions or packing extending dimensions. - RankedTensorType newPackType = tensor::PackOp::inferPackedType( + RankedTensorType newPackType = linalg::PackOp::inferPackedType( expandOp.getSrcType(), packOp.getStaticInnerTiles(), projectedInnerDimsPos, /*outerDimsPerm=*/SmallVector{}); auto reassocExpand = @@ -817,10 +817,10 @@ bubbleUpPackOpThroughExpandShape(tensor::ExpandShapeOp expandOp, return rewriter.notifyMatchFailure( packOp, "could not reassociate dims after bubbling up"); - Value destTensor = tensor::PackOp::createDestinationTensor( + Value destTensor = linalg::PackOp::createDestinationTensor( rewriter, packOp.getLoc(), expandOp.getSrc(), packOp.getMixedTiles(), projectedInnerDimsPos, /*outerDimsPerm=*/SmallVector{}); - Value packedVal = rewriter.create( + Value packedVal = rewriter.create( packOp.getLoc(), expandOp.getSrc(), destTensor, projectedInnerDimsPos, packOp.getMixedTiles(), packOp.getPaddingValue(), /*outerDimsPerm=*/SmallVector{}); @@ -833,12 +833,12 @@ bubbleUpPackOpThroughExpandShape(tensor::ExpandShapeOp expandOp, } class BubbleUpPackOpThroughReshapeOp final - : public OpRewritePattern { + : public OpRewritePattern { public: BubbleUpPackOpThroughReshapeOp(MLIRContext *context, ControlPropagationFn fun) - : OpRewritePattern(context), controlFn(std::move(fun)) {} + : OpRewritePattern(context), controlFn(std::move(fun)) {} - LogicalResult matchAndRewrite(tensor::PackOp packOp, + LogicalResult matchAndRewrite(linalg::PackOp packOp, PatternRewriter &rewriter) const override { Operation *srcOp = packOp.getSource().getDefiningOp(); // Currently only support when the pack op is the only user. @@ -877,7 +877,7 @@ class BubbleUpPackOpThroughReshapeOp final /// /// For example: /// -/// %unpack = tensor.unpack %in outer_dims_perm = [0, 1] +/// %unpack = linalg.unpack %in outer_dims_perm = [0, 1] /// inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %empty /// : tensor -> tensor /// %expanded = tensor.expand_shape %unpack [[0, 1], [2]] @@ -887,11 +887,11 @@ class BubbleUpPackOpThroughReshapeOp final /// /// %expanded = tensor.expand_shape %ain [[0, 1], [2], [3], [4]] /// : tensor into tensor -/// %unpack = tensor.unpack %expanded outer_dims_perm = [0, 1, 2] +/// %unpack = linalg.unpack %expanded outer_dims_perm = [0, 1, 2] /// inner_dims_pos = [1, 2] inner_tiles = [8, 8] into %empty /// : tensor -> tensor static LogicalResult pushDownUnPackOpThroughExpandShape( - tensor::UnPackOp unPackOp, tensor::ExpandShapeOp expandOp, + linalg::UnPackOp unPackOp, tensor::ExpandShapeOp expandOp, PatternRewriter &rewriter, ControlPropagationFn controlFn) { // User controlled propagation function. 
if (!controlFn(&expandOp.getSrcMutable())) @@ -943,16 +943,16 @@ static LogicalResult pushDownUnPackOpThroughExpandShape( nextPos += 1; } - RankedTensorType newExpandType = tensor::PackOp::inferPackedType( + RankedTensorType newExpandType = linalg::PackOp::inferPackedType( expandTy, innerTileSizes, projectedInnerDimsPos, newOuterDimsPerm); auto newExpandOp = rewriter.create( expandOp.getLoc(), newExpandType, unPackOp.getSource(), newReassocIndices); - auto emptyOp = tensor::UnPackOp::createDestinationTensor( + auto emptyOp = linalg::UnPackOp::createDestinationTensor( rewriter, unPackOp.getLoc(), newExpandOp, unPackOp.getMixedTiles(), projectedInnerDimsPos, newOuterDimsPerm); - auto newUnPackOp = rewriter.create( + auto newUnPackOp = rewriter.create( unPackOp.getLoc(), newExpandOp.getResult(), emptyOp, projectedInnerDimsPos, unPackOp.getMixedTiles(), newOuterDimsPerm); rewriter.replaceOp(expandOp, newUnPackOp); @@ -961,14 +961,14 @@ static LogicalResult pushDownUnPackOpThroughExpandShape( } class PushDownUnPackOpThroughReshapeOp final - : public OpRewritePattern { + : public OpRewritePattern { public: PushDownUnPackOpThroughReshapeOp(MLIRContext *context, ControlPropagationFn fun) - : OpRewritePattern(context), controlFn(std::move(fun)) { + : OpRewritePattern(context), controlFn(std::move(fun)) { } - LogicalResult matchAndRewrite(tensor::UnPackOp unPackOp, + LogicalResult matchAndRewrite(linalg::UnPackOp unPackOp, PatternRewriter &rewriter) const override { Value result = unPackOp.getResult(); // Currently only support unpack op with the single user. @@ -1001,7 +1001,7 @@ class PushDownUnPackOpThroughReshapeOp final static FailureOr getUnPackedOperand(GenericOp genericOp) { OpOperand *unPackedOperand = nullptr; for (OpOperand &operand : genericOp->getOpOperands()) { - auto unPackOp = operand.get().getDefiningOp(); + auto unPackOp = operand.get().getDefiningOp(); if (!unPackOp) continue; if (unPackedOperand) @@ -1013,9 +1013,9 @@ static FailureOr getUnPackedOperand(GenericOp genericOp) { return unPackedOperand; } -/// Push down a tensor.unpack op through a generic op. +/// Push down a linalg.unpack op through a generic op. /// The new generic op works on packed domain; pack ops are created for input -/// and output operands. A tensor.unpack op is inserted right after the packed +/// and output operands. A linalg.unpack op is inserted right after the packed /// generic. E.g. /// /// #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> @@ -1023,7 +1023,7 @@ static FailureOr getUnPackedOperand(GenericOp genericOp) { /// %arg0 = tensor<12x2x56x56x32xf32> // packed arg. /// /// %0 = tensor.empty() : tensor<12x56x56x64xf32> -/// %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] +/// %1 = linalg.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] /// inner_dims_pos = [3] inner_tiles = [32] into %0 /// %2 = linalg.generic {indexing_maps = [#map], /// iterator_types = ["parallel", "parallel", "parallel", "parallel"]} @@ -1044,7 +1044,7 @@ static FailureOr getUnPackedOperand(GenericOp genericOp) { /// ^bb0(%out : f32): /// linalg.yield %out : f32 /// } -> tensor<12x2x56x56x32xf32> -/// %2 = tensor.unpack %1 outer_dims_perm = [0, 3, 1, 2] +/// %2 = linalg.unpack %1 outer_dims_perm = [0, 3, 1, 2] /// inner_dims_pos = [3] inner_tiles = [32] into %0 /// static FailureOr> @@ -1063,8 +1063,8 @@ pushDownUnPackOpThroughGenericOp(RewriterBase &rewriter, GenericOp genericOp, OpOperand *unPackedOperand = *(maybeUnPackedOperand); // Extract packing information. 
- tensor::UnPackOp producerUnPackOp = - unPackedOperand->get().getDefiningOp(); + linalg::UnPackOp producerUnPackOp = + unPackedOperand->get().getDefiningOp(); assert(producerUnPackOp && "expect a valid UnPackOp"); if (!controlFn(unPackedOperand)) @@ -1079,7 +1079,7 @@ pushDownUnPackOpThroughGenericOp(RewriterBase &rewriter, GenericOp genericOp, auto [packedOutOperand, packedOutIndexingMap] = getOrCreatePackedViewOfOperand(rewriter, genericOp.getLoc(), *packInfo, genericOp, genericOp.getDpsInitOperand(0)); - auto destPack = packedOutOperand.getDefiningOp(); + auto destPack = packedOutOperand.getDefiningOp(); // If the dps init operand of the generic is a tensor.empty, do not pack it // and forward the new tensor.empty as a destination. @@ -1108,7 +1108,7 @@ pushDownUnPackOpThroughGenericOp(RewriterBase &rewriter, GenericOp genericOp, // Insert an unPackOp right after the packed generic. Value unPackOpRes = rewriter - .create(genericOp.getLoc(), newResult, + .create(genericOp.getLoc(), newResult, destPack.getSource(), innerDimsPos, mixedTiles, outerDimsPerm) .getResult(); @@ -1137,7 +1137,7 @@ struct PushDownUnPackOpThroughGenericOp : public OpRewritePattern { ControlPropagationFn controlFn; }; -/// Propagate a tensor.unpack operation through a tensor.pad. The idea is to +/// Propagate a linalg.unpack operation through a tensor.pad. The idea is to /// add as many zero padding dimensions in `high` and `low` based on the number /// of point loops. struct PushDownUnPackThroughPadOp : public OpRewritePattern { @@ -1146,8 +1146,8 @@ struct PushDownUnPackThroughPadOp : public OpRewritePattern { LogicalResult matchAndRewrite(tensor::PadOp padOp, PatternRewriter &rewriter) const override { - tensor::UnPackOp unpackOp = - padOp.getSource().getDefiningOp(); + linalg::UnPackOp unpackOp = + padOp.getSource().getDefiningOp(); if (!unpackOp) return failure(); @@ -1185,12 +1185,12 @@ struct PushDownUnPackThroughPadOp : public OpRewritePattern { loc, /*result=*/Type(), unpackOp.getSource(), lowPad, highPad, paddingVal, padOp.getNofold()); - // Inject the tensor.unpack right after the packed padOp. + // Inject the linalg.unpack right after the packed padOp. Value outputUnPack = rewriter.create( loc, padOp.getResultType().getShape(), padOp.getResultType().getElementType()); - Value replacement = rewriter.create( + Value replacement = rewriter.create( loc, newPadOp.getResult(), outputUnPack, innerDimsPos, unpackOp.getMixedTiles(), outerDimsPerm); rewriter.replaceOp(padOp, replacement); diff --git a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp b/mlir/lib/Dialect/Linalg/Transforms/PackAndUnpackPatterns.cpp similarity index 90% rename from mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp rename to mlir/lib/Dialect/Linalg/Transforms/PackAndUnpackPatterns.cpp index 3566714c6529e..0984b6988b93b 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/PackAndUnpackPatterns.cpp @@ -13,7 +13,7 @@ #include "mlir/IR/PatternMatch.h" namespace mlir { -namespace tensor { +namespace linalg { namespace { /// Returns the number of shape sizes that is either dynamic or greater than 1. 
@@ -201,7 +201,7 @@ struct FoldPadWithPackOp : public OpRewritePattern { LogicalResult matchAndRewrite(PackOp packOp, PatternRewriter &rewriter) const override { - auto padOp = packOp.getSource().getDefiningOp(); + auto padOp = packOp.getSource().getDefiningOp(); if (!padOp || padOp.getNofold() || !padOp.hasZeroLowPad()) return failure(); @@ -224,10 +224,11 @@ struct FoldPadWithPackOp : public OpRewritePattern { /// Fold a `unpack` -> `extract_slice` into the `unpack` since it already /// has extract_slice semantics. -struct FoldUnpackWithExtractSliceOp : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +struct FoldUnpackWithExtractSliceOp + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(ExtractSliceOp sliceOp, + LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp, PatternRewriter &rewriter) const override { auto unpackOp = sliceOp.getSource().getDefiningOp(); if (!unpackOp) @@ -247,7 +248,7 @@ struct FoldUnpackWithExtractSliceOp : public OpRewritePattern { // Create a new empty output tensor. Type elementType = unpackOp.getDestType().getElementType(); - Value output = rewriter.create( + Value output = rewriter.create( sliceOp.getLoc(), sliceOp.getMixedSizes(), elementType); rewriter.replaceOpWithNewOp( sliceOp, unpackOp.getSource(), output, unpackOp.getInnerDimsPos(), @@ -474,6 +475,50 @@ struct FoldConsumerUnPackWithProducerLinalgTransposeOp return success(); } }; + +/// tensor.empty does not define any tensor contents, so an unpadded pack +/// can be folded away. +struct FoldEmptyTensorWithPackOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(PackOp packOp, + PatternRewriter &rewriter) const override { + // Check for tensor.empty source. + auto emptyOp = packOp.getSource().getDefiningOp(); + if (!emptyOp) + return failure(); + + // Check for padding. + // Packing with padding cannot be simply removed. + if (packOp.getPaddingValue()) + return rewriter.notifyMatchFailure(packOp, "expects no padding value"); + + // Replace the pack directly with its destination. + rewriter.replaceOp(packOp, packOp.getDest()); + + return success(); + } +}; + +/// tensor.empty does not define any tensor contents, so an unpack +/// can be folded away. +struct FoldEmptyTensorWithUnPackOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(UnPackOp unPackOp, + PatternRewriter &rewriter) const override { + // Check for tensor.empty source. + auto emptyOp = unPackOp.getSource().getDefiningOp(); + if (!emptyOp) + return failure(); + + // Replace the unpack directly with its destination. 
+ rewriter.replaceOp(unPackOp, unPackOp.getDest()); + + return success(); + } +}; + } // namespace void populateFoldIntoPackAndUnpackPatterns(RewritePatternSet &patterns) { @@ -490,5 +535,11 @@ void populateSimplifyPackAndUnpackPatterns(RewritePatternSet &patterns) { patterns.getContext()); } -} // namespace tensor +void populateFoldPackUnpackIntoTensorEmptyPatterns( + RewritePatternSet &patterns) { + patterns.add( + patterns.getContext()); +} + +} // namespace linalg } // namespace mlir diff --git a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp index b7764da26a7f4..faa7bbf9d168a 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp @@ -10,14 +10,17 @@ #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Interfaces/TilingInterface.h" +#include "mlir/Interfaces/ValueBoundsOpInterface.h" #include using namespace mlir; @@ -563,6 +566,648 @@ struct LinalgOpPartialReductionInterface } }; +template +static SmallVector getPackUnPackIterationDomain(OpTy op, + OpBuilder &builder) { + static_assert(llvm::is_one_of::value, + "applies to only pack or unpack operations"); + OpBuilder::InsertionGuard g(builder); + int64_t rank = (std::is_same::value) ? op.getSourceRank() + : op.getDestRank(); + OpFoldResult zero = builder.getIndexAttr(0); + OpFoldResult one = builder.getIndexAttr(1); + ReifiedRankedShapedTypeDims resultShape; + (void)reifyResultShapes(builder, op, resultShape); + SmallVector loopBounds(rank); + for (auto dim : llvm::seq(0, rank)) { + loopBounds[dim].offset = zero; + loopBounds[dim].stride = one; + loopBounds[dim].size = resultShape[0][dim]; + } + return loopBounds; +} + +static void applyPermToRange(SmallVector &offsets, + SmallVector &sizes, + ArrayRef permutation) { + if (permutation.empty()) + return; + applyPermutationToVector(offsets, permutation); + applyPermutationToVector(sizes, permutation); +} + +struct PackOpTiling + : public TilingInterface::ExternalModel { + + SmallVector getLoopIteratorTypes(Operation *op) const { + // Note that here we only consider untiled dimensions and outer tiled data + // dimensions, the inner tiled data dimensions are materialized when + // building the body of the operation. + auto packOp = cast(op); + SmallVector iteratorTypes( + packOp.getSourceRank(), utils::IteratorType::parallel); + return iteratorTypes; + } + + SmallVector getIterationDomain(Operation *op, OpBuilder &b) const { + return getPackUnPackIterationDomain(cast(op), b); + } + + FailureOr + getTiledImplementation(Operation *op, OpBuilder &b, + ArrayRef offsets, + ArrayRef sizes) const { + auto packOp = cast(op); + Location loc = packOp.getLoc(); + + // The tiling is applied on interchanged dimensions. We have to undo the + // interchange to map sizes and offsets to the original input. 
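    // Illustration (hypothetical values): with outer_dims_perm = [1, 0], the
    // offsets/sizes arrive in destination (permuted) order, so applying the
    // inverse permutation below restores source order, e.g. offsets [o1, o0]
    // become [o0, o1] before the per-dimension tile arithmetic.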
+ int64_t inputRank = packOp.getSourceRank(); + SmallVector origOffsets(offsets); + SmallVector origSizes(sizes); + applyPermToRange(origOffsets, origSizes, + invertPermutationVector(packOp.getOuterDimsPerm())); + + DenseMap dimAndTileMapping = + packOp.getDimAndTileMapping(); + SmallVector srcDimValues = + tensor::getMixedSizes(b, loc, packOp.getSource()); + SmallVector inputIndices, inputSizes; + for (auto dim : llvm::seq(0, inputRank)) { + using AV = affine::AffineValueExpr; + affine::AffineBuilder ab(b, loc); + AffineExpr dim0, dim1, sym; + bindDims(b.getContext(), dim0, dim1); + bindSymbols(b.getContext(), sym); + if (dimAndTileMapping.count(dim)) { + // If the data dimension is tiled, the i-th index is the product of + // offset_i and tile_i, and the i-th size is the product of sizes_i and + // tile_i. + auto avOffset = AV(dim0).bind(origOffsets[dim]); + auto avSize = AV(dim0).bind(origSizes[dim]); + auto avTileSize = AV(sym).bind(dimAndTileMapping[dim]); + inputIndices.push_back(ab.mul(avOffset, avTileSize)); + inputSizes.push_back(ab.mul(avSize, avTileSize)); + } else { + inputIndices.push_back(origOffsets[dim]); + inputSizes.push_back(origSizes[dim]); + } + + // Limit the size of the input operand for incomplete tiles. + if (packOp.getPaddingValue()) { + OpFoldResult dimSize = srcDimValues[dim]; + auto avDimSize = AV(dim0).bind(dimSize); + auto avInputIdx = AV(dim1).bind(inputIndices.back()); + inputSizes.back() = + ab.min({inputSizes.back(), ab.sub(avDimSize, avInputIdx)}); + } + } + + auto oneAttr = b.getI64IntegerAttr(1); + SmallVector strides(inputRank, oneAttr); + + SmallVector tiledOperands; + auto sourceSlice = b.create( + loc, packOp.getSource(), inputIndices, inputSizes, strides); + tiledOperands.push_back(sourceSlice); + + SmallVector outputOffsets, outputSizes; + if (failed(getResultTilePosition(op, b, 0, offsets, sizes, outputOffsets, + outputSizes))) + return {}; + + strides.append(packOp.getDestRank() - inputRank, oneAttr); + auto outSlice = b.create( + loc, packOp.getDest(), outputOffsets, outputSizes, strides); + tiledOperands.push_back(outSlice); + + if (auto val = packOp.getPaddingValue()) + tiledOperands.push_back(val); + for (auto tile : packOp.getInnerTiles()) + tiledOperands.push_back(tile); + + Operation *tiledPackOp = b.create( + loc, TypeRange{outSlice.getType()}, tiledOperands, op->getAttrs()); + + return TilingResult{ + {tiledPackOp}, + SmallVector(tiledPackOp->getResults()), + llvm::to_vector(ArrayRef{sourceSlice, outSlice})}; + } + + LogicalResult + getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber, + ArrayRef offsets, + ArrayRef sizes, + SmallVector &resultOffsets, + SmallVector &resultSizes) const { + // The iteration domain is over outer dimensions of packed layout. In this + // context, the outer dimensions of `resultOffsets` are `offsets`. The + // inner dimensions of `resultOffsets` are zeros because tiling is not + // applied to them. 
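    // Hypothetical example: for a pack with inner_tiles = [8, 32] and no
    // outer_dims_perm, iteration-domain offsets [2, 3] and sizes [4, 4] give
    // resultOffsets = [2, 3, 0, 0] and resultSizes = [4, 4, 8, 32].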
+ auto packOp = cast(op); + int64_t inputRank = packOp.getSourceRank(); + int64_t outputRank = packOp.getDestRank(); + auto zeroAttr = b.getI64IntegerAttr(0); + resultOffsets.assign(offsets.begin(), offsets.end()); + resultOffsets.append(outputRank - inputRank, zeroAttr); + + ReifiedRankedShapedTypeDims outputShape; + (void)reifyResultShapes(b, packOp, outputShape); + resultSizes.assign(sizes.begin(), sizes.end()); + for (auto dataTileDim : llvm::seq(inputRank, outputRank)) + resultSizes.push_back(outputShape[0][dataTileDim]); + + return success(); + } + + FailureOr + generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber, + ArrayRef offsets, + ArrayRef sizes) const { + auto packOp = cast(op); + int64_t numTiles = packOp.getInnerDimsPos().size(); + + // tensor.pack op is fusible (as a producer) only if full inner tiles are + // iterated or inner dims are not tiled. Otherwise, it will generate a + // sequence of non-trivial ops (for partial tiles). + for (auto offset : offsets.take_back(numTiles)) + if (!isConstantIntValue(offset, 0)) + return failure(); + + for (auto iter : + llvm::zip_equal(packOp.getMixedTiles(), sizes.take_back(numTiles))) + if (!isEqualConstantIntOrValue(std::get<0>(iter), std::get<1>(iter))) + return failure(); + + FailureOr tilingResult = getTiledImplementation( + op, b, offsets.drop_back(numTiles), sizes.drop_back(numTiles)); + if (failed(tilingResult)) + return failure(); + return tilingResult.value(); + } + + /// Method to return the position of iteration domain tile computed by the + /// tiled operation. In current `tensor.pack` context, the `resultOffsets` and + /// `resultSizes` only cover outer dimensions. + LogicalResult getIterationDomainTileFromOperandTile( + Operation *op, OpBuilder &b, unsigned operandNumber, + ArrayRef offsets, ArrayRef sizes, + SmallVectorImpl &resultOffsets, + SmallVectorImpl &resultSizes) const { + if (operandNumber != 0) + return failure(); + + auto packOp = cast(op); + // It is not trivial to infer dest tile from source tile if `packOp` has + // padding semantic. + if (packOp.getPaddingValue()) + return failure(); + + Location loc = packOp.getLoc(); + + SmallVector outerDimOffsets, outerDimSizes; + DenseMap dimAndTileMapping = + packOp.getDimAndTileMapping(); + for (auto dim : llvm::seq(packOp.getSourceRank())) { + if (dimAndTileMapping.count(dim)) { + FailureOr cstSize = + ValueBoundsConstraintSet::computeConstantBound( + presburger::BoundType::UB, sizes[dim], + /*stopCondition=*/nullptr, /*closedUB=*/true); + std::optional cstInnerSize = + getConstantIntValue(dimAndTileMapping[dim]); + // Currently fusing `packOp` as consumer only expects perfect tiling + // scenario because even if without padding semantic, the `packOp` may + // also yield incomplete tiles. E.g. tensor<30xf32> -> tensor<5x6xf32>, + // where the `tileSize` from operand of `packOp` is 5, which is not + // exactly divided by `innerTile`(=6) of `packOp`. As the result: + // 1. the first slice is extracted from (0) to (4) and inserted into + // (0,0)~(0,4) at first row. + // 2. the second slice is extracted from (5) to (9) and SHOULD BE + // respectively inserted into two rows with different length, including + // first row: (0,5) and second row (1,0)~(1,3). It is hard to coordinate + // them, thus adding below constraint to bypass them temporarily. In + // another word, we can only support tiling with consumer if the tile + // size for the producer is a multiple of the inner tile size for the + // packed dimensions at this moment. 
+ if (failed(cstSize) || !cstInnerSize || *cstSize % *cstInnerSize != 0) { + return failure(); + } + + using AV = affine::AffineValueExpr; + affine::AffineBuilder ab(b, loc); + AffineExpr dim0, sym; + bindDims(b.getContext(), dim0); + bindSymbols(b.getContext(), sym); + auto avOffset = AV(dim0).bind(offsets[dim]); + auto avSize = AV(dim0).bind(sizes[dim]); + auto avTileSize = AV(sym).bind(dimAndTileMapping[dim]); + outerDimOffsets.push_back(ab.floor(avOffset, avTileSize)); + outerDimSizes.push_back(ab.ceil(avSize, avTileSize)); + } else { + outerDimOffsets.push_back(offsets[dim]); + outerDimSizes.push_back(sizes[dim]); + } + } + applyPermToRange(outerDimOffsets, outerDimSizes, packOp.getOuterDimsPerm()); + resultOffsets = outerDimOffsets; + resultSizes = outerDimSizes; + return success(); + } + + /// Method to return the tiled implementation of tensor.pack as a consumer. + FailureOr getTiledImplementationFromOperandTile( + Operation *op, OpBuilder &b, unsigned operandNumber, + ArrayRef offsets, ArrayRef sizes) const { + if (operandNumber != 0) + return failure(); + + auto packOp = cast(op); + Location loc = packOp.getLoc(); + + int64_t inputRank = packOp.getSourceRank(); + auto oneAttr = b.getI64IntegerAttr(1); + SmallVector strides(inputRank, oneAttr); + + SmallVector tiledOperands; + auto sourceSlice = b.create( + loc, packOp.getSource(), offsets, sizes, strides); + tiledOperands.push_back(sourceSlice); + + SmallVector outerDimOffsets, outerDimSizes; + if (failed(getIterationDomainTileFromOperandTile( + op, b, /*operandNumber=*/0, offsets, sizes, outerDimOffsets, + outerDimSizes))) + return failure(); + + SmallVector outputOffsets, outputSizes; + if (failed(getResultTilePosition(op, b, 0, outerDimOffsets, outerDimSizes, + outputOffsets, outputSizes))) + return failure(); + + strides.append(packOp.getDestRank() - inputRank, oneAttr); + auto outSlice = b.create( + loc, packOp.getDest(), outputOffsets, outputSizes, strides); + tiledOperands.push_back(outSlice); + + assert(!packOp.getPaddingValue() && "Expect no padding semantic"); + for (auto tile : packOp.getInnerTiles()) + tiledOperands.push_back(tile); + + Operation *tiledPackOp = b.create( + loc, TypeRange{outSlice.getType()}, tiledOperands, op->getAttrs()); + + return TilingResult{ + {tiledPackOp}, + SmallVector(tiledPackOp->getResults()), + llvm::to_vector(ArrayRef{sourceSlice, outSlice})}; + } +}; + +struct UnpackTileDimInfo { + bool isAlignedToInnerTileSize; + OpFoldResult sourceOffset; + OpFoldResult sourceSize; + OpFoldResult resultOffset; + OpFoldResult destExpandedSize; +}; + +/// Returns the needed information for tiling unpack op on `tileDim` with given +/// `tileOffset` and `tileSize`. For more details, see the comment of the +/// `getTiledImplementation`. +static UnpackTileDimInfo getUnpackTileDimInfo(OpBuilder &b, UnPackOp unpackOp, + int64_t tileDim, + OpFoldResult tileOffset, + OpFoldResult tileSize) { + UnpackTileDimInfo info; + Attribute zeroAttr = b.getIndexAttr(0); + Attribute oneAttr = b.getIndexAttr(1); + DenseMap dimAndTileMapping = + unpackOp.getDimAndTileMapping(); + // The dimension is not one of packed data dimension. 
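  // Hypothetical example: for an unpack with inner_dims_pos = [1], tileDim = 0
  // is untiled, so the source offset/size are simply the requested tile
  // offset/size and no inner-tile arithmetic is required.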
+ if (!dimAndTileMapping.count(tileDim)) { + info.isAlignedToInnerTileSize = true; + info.sourceOffset = tileOffset; + info.sourceSize = tileSize; + info.resultOffset = zeroAttr; + info.destExpandedSize = tileSize; + return info; + } + + Location loc = unpackOp.getLoc(); + using AV = affine::AffineValueExpr; + affine::AffineBuilder ab(b, loc); + AffineExpr dim0, dim1, sym0; + bindDims(b.getContext(), dim0, dim1); + bindSymbols(b.getContext(), sym0); + + OpFoldResult innerTileSize = dimAndTileMapping[tileDim]; + + info.isAlignedToInnerTileSize = false; + FailureOr cstSize = ValueBoundsConstraintSet::computeConstantBound( + presburger::BoundType::UB, tileSize, + /*stopCondition=*/nullptr, /*closedUB=*/true); + std::optional cstInnerSize = getConstantIntValue(innerTileSize); + if (!failed(cstSize) && cstInnerSize) { + if (*cstSize % *cstInnerSize == 0) + info.isAlignedToInnerTileSize = true; + + // If the tiling size equals to the inner tiling size, the outer dims are + // always 1. + if (*cstInnerSize == *cstSize) { + auto lhs = AV(dim0).bind(tileOffset); + auto rhs = AV(dim1).bind(innerTileSize); + info.sourceOffset = ab.floor(lhs, rhs); + info.sourceSize = oneAttr; + info.resultOffset = zeroAttr; + info.destExpandedSize = tileSize; + return info; + } + } + + if (info.isAlignedToInnerTileSize) { + info.sourceOffset = + ab.floor(AV(dim0).bind(tileOffset), AV(dim1).bind(innerTileSize)); + info.resultOffset = zeroAttr; + info.destExpandedSize = tileSize; + + // The ceilDiv is needed here because there could be incomplete tile even + // it is perfect tiling cases. E.g., + // %0 = unpack tensor<33x2xf32> into tensor<64xf32> + // If the tiling size is 32, there will be 3 tiles. Two of them have + // size=32; one of them have size=2. The size is represented using + // affine_min op; we need ceilDiv. + info.sourceSize = + ab.ceil(AV(dim0).bind(tileSize), AV(dim1).bind(innerTileSize)); + return info; + } + + affine::DivModValue firstCoord = affine::getDivMod( + b, loc, getValueOrCreateConstantIndexOp(b, loc, tileOffset), + getValueOrCreateConstantIndexOp(b, loc, innerTileSize)); + OpFoldResult tileExclusiveBound = + ab.add(AV(dim0).bind(tileOffset), AV(dim1).bind(tileSize)); + affine::DivModValue lastCoord = affine::getDivMod( + b, loc, + getValueOrCreateConstantIndexOp( + b, loc, + ab.sub(AV(dim0).bind(tileExclusiveBound), AV(dim1).bind(oneAttr))), + getValueOrCreateConstantIndexOp(b, loc, innerTileSize)); + + OpFoldResult lengthMinusOne = ab.sub(AV(dim0).bind(lastCoord.quotient), + AV(dim1).bind(firstCoord.quotient)); + info.sourceSize = + ab.add(AV(dim0).bind(lengthMinusOne), AV(dim1).bind(oneAttr)); + info.sourceOffset = firstCoord.quotient; + info.resultOffset = firstCoord.remainder; + // Do not create an Affine ops for expanded size because the affine op is too + // complicated which would trigger an issue in affine ops simplification. + info.destExpandedSize = b.createOrFold( + loc, getValueOrCreateConstantIndexOp(b, loc, info.sourceSize), + getValueOrCreateConstantIndexOp(b, loc, innerTileSize)); + return info; +} + +struct UnPackOpTiling + : public TilingInterface::ExternalModel { + + SmallVector getLoopIteratorTypes(Operation *op) const { + auto unpackOp = cast(op); + SmallVector iteratorTypes( + unpackOp.getDestRank(), utils::IteratorType::parallel); + return iteratorTypes; + } + + SmallVector getIterationDomain(Operation *op, OpBuilder &b) const { + return getPackUnPackIterationDomain(cast(op), b); + } + + /// There are two cases in tiling unpack ops. 
If the tiling size is aligned to + /// the inner tile size, the corresponding tiles of source are all complete. + /// Otherwise, there are in-complete tiles. We will need to expand the slice + /// of source for getting complete tiles. The tiled unpack op unpacks more + /// data from source, so We'll need an extract_slice op to shift and truncate + /// the output. + /// Take Nn_to_N as an example. Say that N=32, n=8, and tiling_size=15. The + /// coordinates of second tile (i.e., result[15..31]) are + /// [(1, 7), (2, 0,), (2, 1) ... (3, 6), (3, 7)]. The first row and the last + /// row are incomplete tiles. To represent the unpack op, we have to complete + /// the rows. I.e., the input coordinates would start with (1, 0); end with + /// (3, 7). In this context, the tiled unpack produces a (3 * n) elements + /// because there are 3 rows in total. Follow by a tensor.extract_slice op, we + /// can get the actual result. + FailureOr + getTiledImplementation(Operation *op, OpBuilder &b, + ArrayRef offsets, + ArrayRef sizes) const { + auto unpackOp = cast(op); + int64_t srcRank = unpackOp.getSourceRank(); + int64_t destRank = unpackOp.getDestRank(); + int64_t numInnerTiles = srcRank - destRank; + Location loc = unpackOp.getLoc(); + + // The perfect tiling case indicates that the tiling sizes are multiple of + // inner_tile_size. In this context, no extra data is needed when + // representing the tiled unpack op. + bool isPerfectTilingCase = true; + Attribute oneAttr = b.getIndexAttr(1); + SmallVector sliceSrcStrides(destRank, oneAttr); + SmallVector sliceSrcIndices, sliceSrcSizes; + SmallVector destExpandedSizes, resultOffsetsFromDest; + for (auto dim : llvm::seq(0, destRank)) { + UnpackTileDimInfo info = + getUnpackTileDimInfo(b, unpackOp, dim, offsets[dim], sizes[dim]); + if (!info.isAlignedToInnerTileSize) + isPerfectTilingCase = false; + sliceSrcIndices.push_back(info.sourceOffset); + sliceSrcSizes.push_back(info.sourceSize); + destExpandedSizes.push_back(info.destExpandedSize); + resultOffsetsFromDest.push_back(info.resultOffset); + } + + // The tiling is applied on destination dimensions. We have to apply the + // interchange on source dimensions if outer_dims_perm is set. 
+ applyPermToRange(sliceSrcIndices, sliceSrcSizes, + unpackOp.getOuterDimsPerm()); + Attribute zeroAttr = b.getIndexAttr(0); + sliceSrcIndices.append(numInnerTiles, zeroAttr); + sliceSrcSizes.append(unpackOp.getMixedTiles()); + sliceSrcStrides.append(numInnerTiles, oneAttr); + SmallVector generatedSlices; + tensor::ExtractSliceOp sliceSource = b.create( + loc, unpackOp.getSource(), sliceSrcIndices, sliceSrcSizes, + sliceSrcStrides); + generatedSlices.push_back(sliceSource); + + SmallVector destStrides(destRank, oneAttr); + Value sliceDest; + if (isPerfectTilingCase) { + auto destSliceOp = b.create( + loc, unpackOp.getDest(), offsets, sizes, destStrides); + sliceDest = destSliceOp; + generatedSlices.push_back(destSliceOp); + } else { + sliceDest = b.create( + loc, destExpandedSizes, unpackOp.getDestType().getElementType()); + } + + SmallVector tiledOperands = {sliceSource.getResult(), sliceDest}; + for (auto tile : unpackOp.getInnerTiles()) + tiledOperands.push_back(tile); + + Operation *tiledUnpackOp = b.create( + loc, TypeRange{sliceDest.getType()}, tiledOperands, op->getAttrs()); + + if (isPerfectTilingCase) + return TilingResult{{tiledUnpackOp}, + SmallVector(tiledUnpackOp->getResults()), + generatedSlices}; + + auto extractSlice = b.create( + loc, tiledUnpackOp->getResult(0), resultOffsetsFromDest, sizes, + destStrides); + return TilingResult{ + {tiledUnpackOp}, {extractSlice.getResult()}, generatedSlices}; + } + + LogicalResult + getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber, + ArrayRef offsets, + ArrayRef sizes, + SmallVector &resultOffsets, + SmallVector &resultSizes) const { + resultOffsets = llvm::to_vector(offsets); + resultSizes = llvm::to_vector(sizes); + return success(); + } + + FailureOr + generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber, + ArrayRef offsets, + ArrayRef sizes) const { + FailureOr tilingResult = + getTiledImplementation(op, b, offsets, sizes); + if (failed(tilingResult)) + return failure(); + return tilingResult.value(); + } + + /// Method to return the position of iteration domain tile computed by the + /// tiled operation. + LogicalResult getIterationDomainTileFromOperandTile( + Operation *op, OpBuilder &b, unsigned operandNumber, + ArrayRef offsets, ArrayRef sizes, + SmallVectorImpl &resultOffsets, + SmallVectorImpl &resultSizes) const { + auto unPackOp = cast(op); + // If the operand tile is the dest, then no adjustment is needed. + if (operandNumber == unPackOp.getDestMutable().getOperandNumber()) { + resultOffsets = llvm::to_vector(offsets); + resultSizes = llvm::to_vector(sizes); + return success(); + } + Location loc = unPackOp.getLoc(); + + int64_t numTiles = unPackOp.getInnerDimsPos().size(); + auto destOffsets = offsets.drop_back(numTiles); + auto destSizes = sizes.drop_back(numTiles); + // The tiling is applied on interchanged dimensions. We have to undo the + // interchange to map sizes and offsets to the original input. 
+ int64_t outputRank = unPackOp.getDestRank(); + ReifiedRankedShapedTypeDims reifiedReturnShapes; + if (failed(reifyResultShapes(b, unPackOp, reifiedReturnShapes))) + return failure(); + SmallVector outputMixedSizes = reifiedReturnShapes.front(); + SmallVector origOffsets(destOffsets); + SmallVector origSizes(destSizes); + applyPermToRange(origOffsets, origSizes, + invertPermutationVector(unPackOp.getOuterDimsPerm())); + + DenseMap dimAndTileMapping = + unPackOp.getDimAndTileMapping(); + + for (auto dim : llvm::seq(0, outputRank)) { + using AV = affine::AffineValueExpr; + affine::AffineBuilder ab(b, loc); + AffineExpr dim0, dim1, sym0; + bindDims(b.getContext(), dim0, dim1); + bindSymbols(b.getContext(), sym0); + if (dimAndTileMapping.count(dim)) { + // If the data dimension is tiled, the i-th index is the product of + // offset_i and tile_i, and the i-th size is the product of sizes_i and + // tile_i. The sizes must be clamped to the sizes of the unpack result. + auto avOffset = AV(dim0).bind(origOffsets[dim]); + auto avSize = AV(dim0).bind(origSizes[dim]); + auto avTileSize = AV(sym0).bind(dimAndTileMapping[dim]); + auto avResultSize = AV(dim0).bind(outputMixedSizes[dim]); + resultOffsets.push_back(ab.mul(avOffset, avTileSize)); + auto avResultOffset = AV(dim1).bind(resultOffsets.back()); + resultSizes.push_back(ab.min({ab.mul(avSize, avTileSize), + ab.sub(avResultSize, avResultOffset)})); + } else { + resultOffsets.push_back(origOffsets[dim]); + resultSizes.push_back(origSizes[dim]); + } + } + return success(); + } + + /// Method to return the tiled implementation of tensor.unpack as a consumer. + FailureOr getTiledImplementationFromOperandTile( + Operation *op, OpBuilder &b, unsigned operandNumber, + ArrayRef offsets, ArrayRef sizes) const { + auto unPackOp = cast(op); + // tensor.unpack op is fusible (as a consumer) only if inner dims are not + // tiled. + int64_t numTiles = unPackOp.getInnerDimsPos().size(); + for (auto iter : + llvm::zip_equal(unPackOp.getMixedTiles(), sizes.take_back(numTiles))) { + if (!isEqualConstantIntOrValue(std::get<0>(iter), std::get<1>(iter))) + return failure(); + } + + Location loc = unPackOp.getLoc(); + + // Fetch offset/size for creating the slice of the dest operand of + // unpack op. + SmallVector outputOffsets, outputSizes; + if (failed(getIterationDomainTileFromOperandTile( + op, b, /*operandNumber=*/0, offsets, sizes, outputOffsets, + outputSizes))) + return failure(); + + auto oneAttr = b.getI64IntegerAttr(1); + int64_t outputRank = unPackOp.getDestRank(); + SmallVector strides(outputRank, oneAttr); + + SmallVector tiledOperands; + // Create slice of the dest operand. + auto extractDestSlice = b.create( + loc, unPackOp.getDest(), outputOffsets, outputSizes, strides); + tiledOperands.push_back(extractDestSlice); + + SmallVector inputOffsets, inputSizes; + strides.append(unPackOp.getSourceRank() - outputRank, oneAttr); + // Create slice of the source operand. + auto extractSourceSlice = b.create( + loc, unPackOp.getSource(), offsets, sizes, strides); + tiledOperands.insert(tiledOperands.begin(), extractSourceSlice); + for (auto tile : unPackOp.getInnerTiles()) + tiledOperands.push_back(tile); + + // Create tiled unpack op. 
+ Operation *tiledUnPackOp = + b.create(loc, TypeRange{extractDestSlice.getType()}, + tiledOperands, op->getAttrs()); + + return TilingResult{{tiledUnPackOp}, + SmallVector(tiledUnPackOp->getResults()), + llvm::to_vector(ArrayRef{ + extractSourceSlice, extractDestSlice})}; + } +}; + } // namespace template @@ -584,8 +1229,18 @@ void mlir::linalg::registerTilingInterfaceExternalModels( DialectRegistry ®istry) { registry.addExtension(+[](MLIRContext *ctx, linalg::LinalgDialect *dialect) { registerOne(ctx); + linalg::PackOp::attachInterface(*ctx); + linalg::UnPackOp::attachInterface(*ctx); registerAll< #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc" >(ctx); }); } + +void mlir::linalg::registerTilingInterfaceExternalModelsForPackUnPackOps( + DialectRegistry ®istry) { + registry.addExtension(+[](MLIRContext *ctx, LinalgDialect *dialect) { + linalg::PackOp::attachInterface(*ctx); + linalg::UnPackOp::attachInterface(*ctx); + }); +} diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp index 50593b08ad74b..dcd50cc44f81b 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -217,7 +217,7 @@ struct PackedOperandsDimList { } // namespace FailureOr linalg::lowerPack(RewriterBase &rewriter, - tensor::PackOp packOp, + linalg::PackOp packOp, bool lowerPadLikeWithInsertSlice) { // 1. Filter out NYI cases. auto packedTensorType = @@ -238,7 +238,7 @@ FailureOr linalg::lowerPack(RewriterBase &rewriter, PackingMetadata packingMetadata = computePackingMetadata( packedTensorType.getRank(), packOp.getInnerDimsPos()); SmallVector packedToStripMinedShapePerm = - tensor::getPackInverseDestPerm(packOp); + getPackInverseDestPerm(packOp); // 3. Compute the stripMinedShape: this is the packed shape before any outer // or inner permutations have been applied. @@ -353,7 +353,7 @@ FailureOr linalg::lowerPack(RewriterBase &rewriter, } FailureOr -linalg::lowerUnPack(RewriterBase &rewriter, tensor::UnPackOp unPackOp, +linalg::lowerUnPack(RewriterBase &rewriter, linalg::UnPackOp unPackOp, bool lowerUnpadLikeWithExtractSlice) { Location loc = unPackOp->getLoc(); OpBuilder::InsertionGuard g(rewriter); @@ -388,7 +388,7 @@ linalg::lowerUnPack(RewriterBase &rewriter, tensor::UnPackOp unPackOp, // before any outer or inner permutations have been applied. PackingMetadata packingMetadata; SmallVector packedToStripMinedShapePerm = - tensor::getUnPackInverseSrcPerm(unPackOp, packingMetadata); + getUnPackInverseSrcPerm(unPackOp, packingMetadata); // 2. Compute the stripMinedShape: this is the packed shape without outer and // inner permutations. @@ -493,8 +493,8 @@ FailureOr linalg::pack(RewriterBase &rewriter, llvm::interleaveComma(iteratorTypes, DBGS() << "iterators: "); DBGSNL();); - SmallVector packOps; - SmallVector unPackOps; + SmallVector packOps; + SmallVector unPackOps; // Step 1. Pack each dim of the LinalgOp metadata by packedSizes[i]. 
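  // Illustrative reading (hypothetical sizes): for a linalg.matmul with loops
  // (m, n, k), packedSizes = [8, 16, 32] requests blocking m by 8, n by 16,
  // and k by 32, one entry per iteration dimension of the LinalgOp.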
PackedOperandsDimList listOfPackedOperandsDim; for (int64_t i = 0, e = packedSizes.size(); i < e; ++i) { @@ -545,7 +545,7 @@ FailureOr linalg::pack(RewriterBase &rewriter, inputsAndInits.push_back(operand); continue; } - Value dest = tensor::PackOp::createDestinationTensor( + Value dest = linalg::PackOp::createDestinationTensor( rewriter, loc, operand, innerPackSizes, innerPos, /*outerDimsPerm=*/{}); ShapedType operandType = cast(operand.getType()); @@ -554,11 +554,11 @@ FailureOr linalg::pack(RewriterBase &rewriter, return getConstantIntValue(tile).has_value(); }); if (areConstantTiles && operandType.hasStaticShape() && - !tensor::PackOp::requirePaddingValue( + !linalg::PackOp::requirePaddingValue( operandType.getShape(), innerPos, cast(dest.getType()).getShape(), {}, innerPackSizes)) { - packOps.push_back(rewriter.create( + packOps.push_back(rewriter.create( loc, operand, dest, innerPos, innerPackSizes)); } else { // TODO: value of the padding attribute should be determined by @@ -566,7 +566,7 @@ FailureOr linalg::pack(RewriterBase &rewriter, auto zeroAttr = rewriter.getZeroAttr(getElementTypeOrSelf(dest.getType())); Value zero = rewriter.create(loc, zeroAttr); - packOps.push_back(rewriter.create( + packOps.push_back(rewriter.create( loc, operand, dest, innerPos, innerPackSizes, zero)); } inputsAndInits.push_back(packOps.back()); @@ -586,14 +586,14 @@ FailureOr linalg::pack(RewriterBase &rewriter, // Step 4. Propagate packing to all the op results. for (OpResult result : packedLinalgOp->getResults()) { int64_t resultNum = result.getResultNumber(); - tensor::PackOp maybePackedInit = - inits[resultNum].getDefiningOp(); + linalg::PackOp maybePackedInit = + inits[resultNum].getDefiningOp(); if (!maybePackedInit) { results.push_back(result); continue; } // Build the symmetrical UnPackOp to the existing PackOp. - unPackOps.push_back(rewriter.create( + unPackOps.push_back(rewriter.create( packedLinalgOp->getLoc(), result, maybePackedInit.getSource(), maybePackedInit.getInnerDimsPos(), maybePackedInit.getMixedTiles())); results.push_back(unPackOps.back()); @@ -674,15 +674,15 @@ static LinalgOp transposeOneLinalgOperandAndReplace( } FailureOr -linalg::packTranspose(RewriterBase &rewriter, tensor::PackOp packOp, - linalg::LinalgOp linalgOp, tensor::UnPackOp maybeUnPackOp, +linalg::packTranspose(RewriterBase &rewriter, linalg::PackOp packOp, + linalg::LinalgOp linalgOp, linalg::UnPackOp maybeUnPackOp, ArrayRef outerPerm, ArrayRef innerPerm) { Location loc = linalgOp.getLoc(); // Step 1. Transpose packOp. rewriter.setInsertionPoint(packOp); - tensor::PackOp transposedPackOp = + linalg::PackOp transposedPackOp = packOp.createTransposedClone(rewriter, loc, innerPerm, outerPerm); if (!packOp.getResult().hasOneUse()) @@ -733,7 +733,7 @@ linalg::packTranspose(RewriterBase &rewriter, tensor::PackOp packOp, rewriter, linalgOp, packUse, permutation, transposedPackOp.getResult()); // Step 3. Maybe transpose unPackOp. - tensor::UnPackOp transposedUnPackOp; + linalg::UnPackOp transposedUnPackOp; if (maybeUnPackOp) { OpOperand &opOperand = transposedLinalgOp->getOpOperand(packUseOperandNumber); @@ -1024,7 +1024,7 @@ LogicalResult ExtractSliceOfPadTensorSwapPattern::matchAndRewrite( /// /// This method assumes that all outer dims for this pack Op are 1. 
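/// (Hypothetical example of an op satisfying this assumption: linalg.pack %src
///  padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 16]
///  into %dest : tensor<5x9xf32> -> tensor<1x1x8x16xf32>, where both outer
///  dims of the result are 1.)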
static Value getPackOpSourceOrPaddedSource(OpBuilder &builder, - tensor::PackOp packOp) { + linalg::PackOp packOp) { Value input = packOp.getSource(); if (!packOp.getPaddingValue()) { return input; @@ -1141,7 +1141,7 @@ getPackUnpackRankReducedPerm(ArrayRef shape, } LogicalResult DecomposeOuterUnitDimsPackOpPattern::matchAndRewrite( - tensor::PackOp packOp, PatternRewriter &rewriter) const { + linalg::PackOp packOp, PatternRewriter &rewriter) const { // TODO: support the case that outer dimensions are not all 1s. A // tensor.expand_shape will be generated in this case. if (llvm::any_of(packOp.getAllOuterDims(), @@ -1242,7 +1242,7 @@ LogicalResult DecomposeOuterUnitDimsPackOpPattern::matchAndRewrite( } LogicalResult DecomposeOuterUnitDimsUnPackOpPattern::matchAndRewrite( - tensor::UnPackOp unpackOp, PatternRewriter &rewriter) const { + linalg::UnPackOp unpackOp, PatternRewriter &rewriter) const { int64_t srcRank = unpackOp.getSourceRank(); int64_t destRank = unpackOp.getDestRank(); ArrayRef srcShape = unpackOp.getSourceType().getShape(); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index f2c23c49a78e8..ae04c2b6b2a5b 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -1499,11 +1499,11 @@ vectorizeAsLinalgGeneric(RewriterBase &rewriter, VectorizationState &state, return success(); } -/// Given a tensor::PackOp, return the `dest` shape before any packing +/// Given a linalg::PackOp, return the `dest` shape before any packing /// permutations. -static SmallVector getTiledPackShape(tensor::PackOp packOp, +static SmallVector getTiledPackShape(linalg::PackOp packOp, ArrayRef destShape) { - return applyPermutation(destShape, tensor::getPackInverseDestPerm(packOp)); + return applyPermutation(destShape, linalg::getPackInverseDestPerm(packOp)); } /// Given an input, the mixed destSizes, and the vector sizes for vectorization, @@ -1558,7 +1558,7 @@ static Operation *createWriteOrMaskedWrite(OpBuilder &builder, Location loc, return write; } -/// Vectorize tensor::PackOp with (1) static innerTiles (2) constant +/// Vectorize linalg::PackOp with (1) static innerTiles (2) constant /// padding value and (3) input vector sizes into: /// masked_transfer_read->shape_cast->transpose->transfer_write_in_bounds /// As in the following example: @@ -1585,7 +1585,7 @@ static Operation *createWriteOrMaskedWrite(OpBuilder &builder, Location loc, /// determined by the result tensor shape. Also, we update the inBounds /// attribute instead of masking. static LogicalResult -vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp, +vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp, ArrayRef inputVectorSizes, SmallVectorImpl &newResults) { // TODO: Introduce a parent class that will handle the insertion point update. @@ -1639,7 +1639,7 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp, // Create TransposeOp. 
auto destPermutation = - invertPermutationVector(tensor::getPackInverseDestPerm(packOp)); + invertPermutationVector(getPackInverseDestPerm(packOp)); auto transposeOp = rewriter.create( loc, shapeCastOp.getResult(), destPermutation); @@ -1651,7 +1651,7 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp, return success(); } -/// Vectorize a `tensor::UnPackOp` to these 4 Ops: +/// Vectorize a `linalg::UnPackOp` to these 4 Ops: /// Vector::TransferReadOp - Reads a vector from the source tensor /// vector::TransposeOp - Transpose the Source tensor /// ShapeCastOp - Reshape the data based on the target. @@ -1661,7 +1661,7 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp, /// * the vector sizes are determined by the input operand and attributes, /// * update the inBounds attribute instead of masking. static LogicalResult -vectorizeAsTensorUnpackOp(RewriterBase &rewriter, tensor::UnPackOp unpackOp, +vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp, ArrayRef inputVectorSizes, SmallVectorImpl &newResults) { @@ -1754,7 +1754,7 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, tensor::UnPackOp unpackOp, PackingMetadata packMetadata; SmallVector lastDimToInsertPosPerm = - tensor::getUnPackInverseSrcPerm(unpackOp, packMetadata); + getUnPackInverseSrcPerm(unpackOp, packMetadata); ShapedType maskedOpShapedType = cast(readResult.getType()); SmallVector stripMineShape(maskedOpShapedType.getShape()); mlir::Type stripMineElemType = maskedOpShapedType.getElementType(); @@ -1887,7 +1887,7 @@ vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op, /// Need to check if the inner-tiles are static/constant. static LogicalResult -vectorizeUnPackOpPrecondition(tensor::UnPackOp unpackOp, +vectorizeUnPackOpPrecondition(linalg::UnPackOp unpackOp, ArrayRef inputVectorSizes) { if (llvm::any_of(unpackOp.getInnerTiles(), [](OpFoldResult res) { @@ -2007,7 +2007,7 @@ static LogicalResult vectorizeLinalgOpPrecondition( } static LogicalResult -vectorizePackOpPrecondition(tensor::PackOp packOp, +vectorizePackOpPrecondition(linalg::PackOp packOp, ArrayRef inputVectorSizes) { auto padValue = packOp.getPaddingValue(); Attribute cstAttr; @@ -2203,10 +2203,10 @@ LogicalResult mlir::linalg::vectorizeOpPrecondition( .Case([&](auto padOp) { return vectorizePadOpPrecondition(padOp, inputVectorSizes); }) - .Case([&](auto packOp) { + .Case([&](auto packOp) { return vectorizePackOpPrecondition(packOp, inputVectorSizes); }) - .Case([&](auto unpackOp) { + .Case([&](auto unpackOp) { return vectorizeUnPackOpPrecondition(unpackOp, inputVectorSizes); }) .Case([&](auto sliceOp) { @@ -2231,7 +2231,7 @@ static void convertAffineApply(RewriterBase &rewriter, LinalgOp linalgOp) { } bool mlir::linalg::hasVectorizationImpl(Operation *op) { - return isa(op); } @@ -2308,18 +2308,18 @@ LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op, return vectorizeAsTensorPadOp(rewriter, padOp, inputVectorSizes, results); }) - .Case([&](auto packOp) { + .Case([&](auto packOp) { return vectorizeAsTensorPackOp(rewriter, packOp, inputVectorSizes, results); }) + .Case([&](auto unpackOp) { + return vectorizeAsTensorUnpackOp(rewriter, unpackOp, + inputVectorSizes, results); + }) .Case([&](auto sliceOp) { return vectorizeAsInsertSliceOp(rewriter, sliceOp, inputVectorSizes, results); }) - .Case([&](auto unpackOp) { - return vectorizeAsTensorUnpackOp(rewriter, unpackOp, - inputVectorSizes, results); - }) .Default([](auto) { return failure(); }); if (failed(vectorizeResult)) { 
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
index d148067fe6343..d3d301ca093b1 100644
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -142,10 +142,64 @@ static void unpackRanges(OpBuilder &builder, Location loc,
 //===----------------------------------------------------------------------===//
 // General utilities
 //===----------------------------------------------------------------------===//
+//
+/// The permutation can be obtained from two permutations:
+///   a) Compute the permutation vector to move the last `numPackedDims` into
+///      the `innerPosDims` of a shape of rank `rank`.
+///   b) Compute the permutation vector to move outer dims if the
+///      `outerPerm` parameter is not empty.
+/// Apply (b) permutation on (a) permutation to get the final permutation.
+static SmallVector
+computePackUnPackPerm(int64_t rank, ArrayRef &innerDimsPos,
+                      ArrayRef &outerPerm,
+                      PackingMetadata &packingMetadata) {
+  int64_t numPackedDims = innerDimsPos.size();
+  auto lastDims =
+      llvm::to_vector(llvm::seq(rank - numPackedDims, rank));
+  packingMetadata = computePackingMetadata(rank, innerDimsPos);
+  SmallVector innerPositionsPerm =
+      computePermutationVector(rank, lastDims, packingMetadata.insertPositions);
+
+  SmallVector outerPos = packingMetadata.outerPositions;
+  if (!outerPerm.empty())
+    applyPermutationToVector(outerPos, outerPerm);
+  SmallVector outerPositionPerm =
+      computePermutationVector(rank, packingMetadata.outerPositions, outerPos);
+
+  SmallVector packInverseDestPermutation = innerPositionsPerm;
+  applyPermutationToVector(packInverseDestPermutation, outerPositionPerm);
+  return packInverseDestPermutation;
+}
 
 namespace mlir {
 namespace linalg {
 
+SmallVector getPackInverseDestPerm(PackOp packOp) {
+
+  PackingMetadata pMetadata;
+  int64_t packedRank = packOp.getDestType().getRank();
+  ArrayRef innerDimPos = packOp.getInnerDimsPos();
+  ArrayRef outerPerm = packOp.getOuterDimsPerm();
+  SmallVector packInvDestPerm =
+      computePackUnPackPerm(packedRank, innerDimPos, outerPerm, pMetadata);
+  return packInvDestPerm;
+}
+
+SmallVector getUnPackInverseSrcPerm(UnPackOp unpackOp) {
+  PackingMetadata metadata;
+  return getUnPackInverseSrcPerm(unpackOp, metadata);
+}
+
+SmallVector getUnPackInverseSrcPerm(UnPackOp unpackOp,
+                                    PackingMetadata &metadata) {
+  int64_t unpackRank = unpackOp.getSourceType().getRank();
+  ArrayRef innerDimPos = unpackOp.getInnerDimsPos();
+  ArrayRef outerPerm = unpackOp.getOuterDimsPerm();
+  SmallVector unpackInvSrcPerm =
+      computePackUnPackPerm(unpackRank, innerDimPos, outerPerm, metadata);
+  return unpackInvSrcPerm;
+}
+
 bool allIndexingsAreProjectedPermutation(LinalgOp op) {
   return llvm::all_of(op.getIndexingMapsArray(), [](AffineMap m) {
     return m.isProjectedPermutation(/*allowZeroInResults=*/true);
diff --git a/mlir/lib/Dialect/Tensor/IR/CMakeLists.txt b/mlir/lib/Dialect/Tensor/IR/CMakeLists.txt
index d9d09d6361a2f..5425615dac393 100644
--- a/mlir/lib/Dialect/Tensor/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/Tensor/IR/CMakeLists.txt
@@ -16,7 +16,6 @@ add_mlir_dialect_library(MLIRTensorDialect
 
   DEPENDS
   MLIRTensorOpsIncGen
-  MLIRTensorInterfacesIncGen
 
   LINK_LIBS PUBLIC
   MLIRAffineDialect
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp b/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp
index 002077753b132..8af087cbf0f61 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp
@@ -63,7 +63,7 @@ void TensorDialect::initialize() {
declarePromisedInterfaces(); declarePromisedInterface(); - declarePromisedInterfaces(); + declarePromisedInterfaces(); declarePromisedInterfaces(); } diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index 03c2f3843f262..e741144647043 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -10,7 +10,9 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Complex/IR/Complex.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Tensor/Utils/Utils.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Utils/ReshapeOpsUtils.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" @@ -1156,20 +1158,6 @@ void EmptyOp::getCanonicalizationPatterns(RewritePatternSet &results, ReplaceEmptyTensorStaticShapeDims>(context); } -/// Try to remove a tensor operation if it would only reshape a constant. -/// Removes the op and replaces the constant with a new constant of the result -/// shape. When an optional cst attribute is passed, it is reshaped only if the -/// splat value matches the value in the attribute. -static OpFoldResult -reshapeConstantSource(DenseElementsAttr source, TensorType result, - std::optional cst = std::nullopt) { - if (source && source.isSplat() && result.hasStaticShape() && - (!cst.has_value() || source.getSplatValue() == cst.value())) - return source.resizeSplat(result); - - return {}; -} - //===----------------------------------------------------------------------===// // ExtractOp //===----------------------------------------------------------------------===// @@ -3885,916 +3873,6 @@ OpFoldResult SplatOp::fold(FoldAdaptor adaptor) { return SplatElementsAttr::get(getType(), {constOperand}); } -//===----------------------------------------------------------------------===// -// PackOp/UnPackOp Common -//===----------------------------------------------------------------------===// - -template -static LogicalResult -reifyResultShapesImpl(OpTy op, OpBuilder &builder, - ReifiedRankedShapedTypeDims &reifiedReturnShapes) { - static_assert(llvm::is_one_of::value, - "applies to only pack or unpack operations"); - int64_t destRank = op.getDestRank(); - reifiedReturnShapes.resize(1, SmallVector(destRank)); - reifiedReturnShapes[0] = - tensor::getMixedSizes(builder, op.getLoc(), op.getDest()); - return success(); -} - -template -static DenseMap getDimAndTileMappingImpl(OpTy op) { - static_assert(llvm::is_one_of::value, - "applies to only pack or unpack operations"); - DenseMap dimAndTileMapping; - ArrayRef dimsToTile = op.getInnerDimsPos(); - SmallVector tiles = op.getMixedTiles(); - assert(tiles.size() == dimsToTile.size() && - "tiles must match indices of dimension to block"); - // bind the dimension `i` with the tile factor. 
- for (auto i : llvm::seq(0, dimsToTile.size())) - dimAndTileMapping[dimsToTile[i]] = tiles[i]; - return dimAndTileMapping; -} - -template -static SmallVector getMixedTilesImpl(OpTy op) { - static_assert(llvm::is_one_of::value, - "applies to only pack or unpack operations"); - Builder builder(op); - SmallVector mixedInnerTiles; - unsigned dynamicValIndex = 0; - for (int64_t staticTile : op.getStaticInnerTiles()) { - if (!ShapedType::isDynamic(staticTile)) - mixedInnerTiles.push_back(builder.getI64IntegerAttr(staticTile)); - else - mixedInnerTiles.push_back(op.getInnerTiles()[dynamicValIndex++]); - } - return mixedInnerTiles; -} - -template -static SmallVector getStaticTilesImpl(OpTy op) { - static_assert(llvm::is_one_of::value, - "applies to only pack or unpack operations"); - SmallVector dynamicTiles; - SmallVector staticTiles; - dispatchIndexOpFoldResults(op.getMixedTiles(), dynamicTiles, staticTiles); - return staticTiles; -} - -/// Returns true if `dimsPos` is invalid. It is invalid when: -/// a) It contains duplicate. -/// b) At least one dimension is out of bound (`dimPos` is >= 0 and < rank). -/// c) The number of elements in `dimsPos` is > than `rank`. -static bool isInvalidPackingPosSpecification(ArrayRef dimsPos, - size_t rank) { - size_t dimsPosSize = dimsPos.size(); - if (dimsPosSize > rank) - return true; - DenseSet uniqued; - for (int64_t dim : dimsPos) - uniqued.insert(dim); - if (dimsPosSize != uniqued.size()) - return true; - return llvm::any_of(dimsPos, [rank](int64_t dimPos) { - return dimPos < 0 || dimPos >= static_cast(rank); - }); -} - -/// Returns true if the dimension of `sourceShape` is smaller than the dimension -/// of the `limitShape`. -static bool areAllInBound(ArrayRef sourceShape, - ArrayRef limitShape) { - assert( - sourceShape.size() == limitShape.size() && - "expected source shape rank, and limit of the shape to have same rank"); - return llvm::all_of( - llvm::zip(sourceShape, limitShape), [](std::tuple it) { - int64_t sourceExtent = std::get<0>(it); - int64_t limit = std::get<1>(it); - return ShapedType::isDynamic(sourceExtent) || - ShapedType::isDynamic(limit) || sourceExtent <= limit; - }); -} - -template -static LogicalResult commonVerifierPackAndUnPackOp(OpTy packOrUnPack) { - static_assert(llvm::is_one_of::value, - "applies to only pack or unpack operations"); - Operation *op = packOrUnPack.getOperation(); - - // Return true if we have a zero-value tile. - auto hasZeros = [&](ArrayRef tiles) { - return llvm::any_of( - tiles, [](OpFoldResult tile) { return isConstantIntValue(tile, 0); }); - }; - - // Verify tiles. Do not allow zero tiles. - SmallVector mixedTiles = packOrUnPack.getMixedTiles(); - if (hasZeros(mixedTiles)) - return op->emitError("invalid zero tile factor"); - - // Verify inner_dims_pos and outer_dims_perm. - RankedTensorType unpackedType = (std::is_same::value) - ? 
packOrUnPack.getSourceType() - : packOrUnPack.getDestType(); - size_t unpackedRank = unpackedType.getRank(); - ArrayRef innerDimsPos = packOrUnPack.getInnerDimsPos(); - ArrayRef outerDimPerm = packOrUnPack.getOuterDimsPerm(); - if (isInvalidPackingPosSpecification(innerDimsPos, unpackedRank)) - return op->emitError("invalid inner_dims_pos vector"); - if (isInvalidPackingPosSpecification(outerDimPerm, unpackedRank)) - return op->emitError("invalid outer_dims_perm vector"); - if (!outerDimPerm.empty() && outerDimPerm.size() != unpackedRank) - return op->emitError("outer_dims_perm must be a permutation or empty"); - - // Tiling factors must be less than or equal to the input rank for pack (or - // output rank for unpack), and must match the number of `inner_dims_pos`. - if (mixedTiles.size() > unpackedRank) { - return op->emitError("tiling factors must be less than or equal to the " - "input rank for pack or output rank for unpack"); - } - if (mixedTiles.size() != innerDimsPos.size()) { - return op->emitError( - "tiling factors must equal the number of dimensions to tile"); - } - - ShapedType packedType = (std::is_same::value) - ? packOrUnPack.getDestType() - : packOrUnPack.getSourceType(); - size_t packedRank = packedType.getRank(); - // Require output rank to match input rank + number of blocking factors. - size_t expectedPackedRank = unpackedRank + mixedTiles.size(); - if (expectedPackedRank != packedRank) { - return op->emitError( - "packed rank != (unpacked rank + num tiling factors), got ") - << packedRank << " != " << expectedPackedRank; - } - - // Verify result shape is greater than the minimum expected - // by the pack operation, and that the output shape - // represents full tiles. - RankedTensorType expectedPackedType = PackOp::inferPackedType( - unpackedType, packOrUnPack.getStaticTiles(), innerDimsPos, outerDimPerm); - if (!areAllInBound(expectedPackedType.getShape(), packedType.getShape())) { - return op->emitError("the shape of output is not large enough to hold the " - "packed data. Expected at least ") - << expectedPackedType << ", got " << packedType; - } - if (!llvm::all_of( - llvm::zip(packedType.getShape().take_back(mixedTiles.size()), - mixedTiles), - [](std::tuple it) { - int64_t shape = std::get<0>(it); - if (Attribute attr = - llvm::dyn_cast_if_present(std::get<1>(it))) { - IntegerAttr intAttr = dyn_cast_or_null(attr); - int64_t staticTileSize = intAttr.getValue().getSExtValue(); - return shape == staticTileSize; - } - return ShapedType::isDynamic(shape); - })) { - return op->emitError("mismatch in inner tile sizes specified and shaped of " - "tiled dimension in the packed type"); - } - return success(); -} - -namespace { -/// Subset of PackOp/UnPackOp fields used to compute the result of applying -/// various permutations to the op. -// TODO: Add linalg.transpose + pack/unpack folding patterns that just reuse -// these. These may or may not become true foldings / canonicalizations -// depending on how aggressive we want to be in automatically folding -// transposes. 
-struct PackOrUnPackTransposeResult { - SmallVector innerDimsPos; - SmallVector innerTiles; - SmallVector outerDimsPerm; -}; -} // namespace - -template -static PackOrUnPackTransposeResult -commonPermutationOfPackAndUnPackOp(OpTy packOrUnPackOp, - ArrayRef innerPermutation, - ArrayRef outerPermutation) { - static_assert(llvm::is_one_of::value, - "applies to only pack or unpack operations"); - assert((!innerPermutation.empty() || !outerPermutation.empty()) && - "some permutation must be non-empty"); - PackOrUnPackTransposeResult metadata; - metadata.innerDimsPos = - SmallVector(packOrUnPackOp.getInnerDimsPos()); - metadata.innerTiles = - SmallVector(packOrUnPackOp.getMixedTiles()); - int64_t numOuterDims = std::is_same::value - ? packOrUnPackOp.getSourceRank() - : packOrUnPackOp.getDestRank(); - metadata.outerDimsPerm = - packOrUnPackOp.getOuterDimsPerm().empty() - ? llvm::to_vector(llvm::seq(0, numOuterDims)) - : SmallVector(packOrUnPackOp.getOuterDimsPerm()); - if (!innerPermutation.empty()) { - assert(innerPermutation.size() == metadata.innerDimsPos.size() && - isPermutationVector(innerPermutation) && - "invalid inner permutation"); - applyPermutationToVector(metadata.innerDimsPos, innerPermutation); - applyPermutationToVector(metadata.innerTiles, innerPermutation); - } - if (!outerPermutation.empty()) { - assert(outerPermutation.size() == metadata.outerDimsPerm.size() && - isPermutationVector(outerPermutation) && - "invalid outer permutation"); - applyPermutationToVector(metadata.outerDimsPerm, outerPermutation); - } - return metadata; -} - -//===----------------------------------------------------------------------===// -// PackOp -//===----------------------------------------------------------------------===// - -void PackOp::getAsmResultNames(function_ref setNameFn) { - setNameFn(getResult(), "pack"); -} - -void PackOp::build(OpBuilder &builder, OperationState &state, Value source, - Value dest, ArrayRef innerDimsPos, - ArrayRef innerTiles, - std::optional paddingValue, - ArrayRef outerDimsPerm) { - assert(innerDimsPos.size() == innerTiles.size() && - "number of tile sizes specified must match the specified number of " - "original dimensions to be tiled"); - SmallVector staticTileSizes; - SmallVector dynamicTileSizes; - dispatchIndexOpFoldResults(innerTiles, dynamicTileSizes, staticTileSizes); - build(builder, state, dest.getType(), source, dest, - paddingValue ? *paddingValue : nullptr, - outerDimsPerm.empty() ? 
nullptr - : builder.getDenseI64ArrayAttr(outerDimsPerm), - builder.getDenseI64ArrayAttr(innerDimsPos), dynamicTileSizes, - builder.getDenseI64ArrayAttr(staticTileSizes)); -} - -LogicalResult -PackOp::reifyResultShapes(OpBuilder &builder, - ReifiedRankedShapedTypeDims &reifiedReturnShapes) { - return reifyResultShapesImpl(*this, builder, reifiedReturnShapes); -} - -DenseMap PackOp::getDimAndTileMapping() { - return getDimAndTileMappingImpl(*this); -} - -SmallVector PackOp::getMixedTiles() { - return getMixedTilesImpl(*this); -} - -SmallVector PackOp::getStaticTiles() { - return getStaticTilesImpl(*this); -} - -ArrayRef PackOp::getAllOuterDims() { - ShapedType inputType = getSourceType(); - int64_t inputRank = inputType.getRank(); - return getDestType().getShape().take_front(inputRank); -} - -SmallVector PackOp::getTiledOuterDims() { - auto innerDimsPos = getInnerDimsPos(); - auto packedShape = getDestType().getShape(); - SmallVector res; - - for (auto index : innerDimsPos) - res.push_back(packedShape[index]); - - return res; -} - -bool PackOp::requirePaddingValue(ArrayRef inputShape, - ArrayRef innerDimsPos, - ArrayRef outputShape, - ArrayRef outerDimsPerm, - ArrayRef innerTiles) { - SmallVector outputTileSizes( - outputShape.take_front(inputShape.size())); - if (!outerDimsPerm.empty()) { - assert(outerDimsPerm.size() == outputTileSizes.size() && - "expected output and outer_dims_perm to have same size"); - applyPermutationToVector(outputTileSizes, - invertPermutationVector(outerDimsPerm)); - } - for (auto [pos, tileSize] : llvm::zip_equal(innerDimsPos, innerTiles)) { - if (ShapedType::isDynamic(inputShape[pos])) - continue; - std::optional constantTile = getConstantIntValue(tileSize); - - if (!constantTile) { - if (!ShapedType::isDynamic(outputTileSizes[pos]) && - (inputShape[pos] % outputTileSizes[pos] != 0)) - return true; - } else if (inputShape[pos] % (*constantTile) != 0) { - return true; - } - } - return false; -} - -LogicalResult PackOp::verify() { - if (failed(commonVerifierPackAndUnPackOp(*this))) - return failure(); - - // Verify padding value, and bail out if the tile does not divide the - // dimension fully. In the case of dynamic tile factors or dimensions, having - // a partial tile is undefined behavior. - auto paddingValue = getPaddingValue(); - if (paddingValue && - paddingValue.getType() != getSourceType().getElementType()) { - return emitOpError("expected padding_value has ") - << getSourceType().getElementType() - << " but got: " << paddingValue.getType(); - } - - if (!paddingValue && - requirePaddingValue(getSourceType().getShape(), getInnerDimsPos(), - getDestType().getShape(), getOuterDimsPerm(), - getMixedTiles())) { - return emitOpError( - "invalid tile factor or output size provided. Only full tiles are " - "supported when padding_value is not set"); - } - return success(); -} - -/// Converts OpFoldResults to int64_t shape entries, unconditionally mapping all -/// Value's to kDynamic, even if they are arith.constant values. -static SmallVector -asShapeWithAnyValueAsDynamic(ArrayRef ofrs) { - SmallVector result; - for (auto o : ofrs) { - // Have to do this first, as getConstantIntValue special-cases constants. - if (llvm::dyn_cast_if_present(o)) - result.push_back(ShapedType::kDynamic); - else - result.push_back(getConstantIntValue(o).value_or(ShapedType::kDynamic)); - } - return result; -} - -/// Helper for PackOp::{getResultShape,inferPackedType}. Returns the shape of -/// the packed type. 
Having a shared helper helps implement these two methods in -/// a way that ensures that they agree on which dimensions are dynamic. -static SmallVector getPackOpResultTypeShape( - ArrayRef sourceShape, ArrayRef innerTileSizes, - ArrayRef innerDimsPos, ArrayRef outerDimsPerm) { - SmallVector resultShape = llvm::to_vector(sourceShape); - for (auto tiledDim : llvm::enumerate(llvm::to_vector(innerDimsPos))) { - if (ShapedType::isDynamic(resultShape[tiledDim.value()])) - continue; - if (ShapedType::isDynamic(innerTileSizes[tiledDim.index()])) { - resultShape[tiledDim.value()] = ShapedType::kDynamic; - continue; - } - resultShape[tiledDim.value()] = divideCeilSigned( - resultShape[tiledDim.value()], innerTileSizes[tiledDim.index()]); - } - - // Swap tile loops if outer_dims_perm is available. - if (!outerDimsPerm.empty()) - applyPermutationToVector(resultShape, outerDimsPerm); - - // Append the inner tile dimensions. - resultShape.append(innerTileSizes.begin(), innerTileSizes.end()); - return resultShape; -} - -SmallVector PackOp::getResultShape( - OpBuilder &builder, Location loc, ArrayRef sourceDims, - ArrayRef innerTileSizes, ArrayRef innerDimsPos, - ArrayRef outerDimsPerm) { - SmallVector resultDims = llvm::to_vector(sourceDims); - - AffineExpr s0, s1; - bindSymbols(builder.getContext(), s0, s1); - AffineExpr ceilDivExpr = s0.ceilDiv(s1); - for (auto tiledDim : llvm::enumerate(llvm::to_vector(innerDimsPos))) { - resultDims[tiledDim.value()] = affine::makeComposedFoldedAffineApply( - builder, loc, ceilDivExpr, - {resultDims[tiledDim.value()], innerTileSizes[tiledDim.index()]}); - } - if (!outerDimsPerm.empty()) - applyPermutationToVector(resultDims, outerDimsPerm); - resultDims.append(innerTileSizes.begin(), innerTileSizes.end()); - - SmallVector resultTypeShape = - getPackOpResultTypeShape(asShapeWithAnyValueAsDynamic(sourceDims), - asShapeWithAnyValueAsDynamic(innerTileSizes), - innerDimsPos, outerDimsPerm); - - // Fix-up `resultDims` to ensure that they are Value's if and only if the - // result type shape says it's a dynamic dim. This is needed as callers may - // use dispatchIndexOpFoldResults on the result, and rely on exact number of - // dynamic dims returned by that. - for (unsigned i = 0; i < resultDims.size(); ++i) { - if (!ShapedType::isDynamic(resultTypeShape[i])) - continue; - resultDims[i] = - getValueOrCreateConstantIndexOp(builder, loc, resultDims[i]); - } - - return resultDims; -} - -/// Get the expected packed type based on source type, tile factors, position of -/// the inner tiles and permutation of the outer tiled loop. 
-RankedTensorType PackOp::inferPackedType(RankedTensorType sourceType, - ArrayRef innerTileSizes, - ArrayRef innerDimsPos, - ArrayRef outerDimsPerm) { - SmallVector resultShape = getPackOpResultTypeShape( - sourceType.getShape(), innerTileSizes, innerDimsPos, outerDimsPerm); - return RankedTensorType::get(resultShape, sourceType.getElementType()); -} - -Value PackOp::createDestinationTensor(OpBuilder &b, Location loc, Value source, - ArrayRef innerTileSizes, - ArrayRef innerDimsPos, - ArrayRef outerDimsPerm) { - AffineExpr dim0, dim1; - bindDims(b.getContext(), dim0, dim1); - auto ceilDiv = [&](OpFoldResult v1, OpFoldResult v2) -> OpFoldResult { - return affine::makeComposedFoldedAffineApply(b, loc, dim0.ceilDiv(dim1), - {v1, v2}); - }; - - SmallVector mixedSizes; - for (auto [index, value] : llvm::enumerate( - llvm::cast(source.getType()).getShape())) { - if (ShapedType::isDynamic(value)) - mixedSizes.push_back(b.create(loc, source, index).getResult()); - else - mixedSizes.push_back(b.getIndexAttr(value)); - } - for (auto it : llvm::zip(innerDimsPos, innerTileSizes)) { - int64_t dimPos = std::get<0>(it); - OpFoldResult tileSize = std::get<1>(it); - mixedSizes[dimPos] = ceilDiv(mixedSizes[dimPos], tileSize); - } - if (!outerDimsPerm.empty()) - applyPermutationToVector(mixedSizes, outerDimsPerm); - - mixedSizes.append(innerTileSizes.begin(), innerTileSizes.end()); - auto elemType = llvm::cast(source.getType()).getElementType(); - return b.create(loc, mixedSizes, elemType); -} - -PackOp PackOp::createTransposedClone(OpBuilder &b, Location loc, - ArrayRef innerPermutation, - ArrayRef outerPermutation) { - PackOrUnPackTransposeResult metadata = commonPermutationOfPackAndUnPackOp( - *this, innerPermutation, outerPermutation); - Value transposedDest = - createDestinationTensor(b, loc, getSource(), metadata.innerTiles, - metadata.innerDimsPos, metadata.outerDimsPerm); - return b.create(loc, getSource(), transposedDest, - metadata.innerDimsPos, metadata.innerTiles, - getPaddingValue(), metadata.outerDimsPerm); -} - -/// Returns true if the tiles and the tiled dims are constant. -template -bool areTilesAndTiledDimsAllConstant(OpTy op) { - static_assert(llvm::is_one_of::value, - "applies to only pack or unpack operations"); - ShapedType packedType = (std::is_same::value) - ? op.getDestType() - : op.getSourceType(); - SmallVector mixedTiles = op.getMixedTiles(); - for (auto [dimDest, tile] : llvm::zip( - packedType.getShape().take_back(mixedTiles.size()), mixedTiles)) { - std::optional constTileSize = getConstantIntValue(tile); - if (!constTileSize || ShapedType::isDynamic(dimDest)) - return false; - } - return true; -} - -Speculation::Speculatability PackOp::getSpeculatability() { - if (getPaddingValue()) - return Speculation::Speculatable; - - // The verifier rejects already operations if we can statically prove that the - // sizes of the tiles do not divide perfectly the dimension; thus, check only - // to have constant tiles and tiled inner dimensions. - if (!areTilesAndTiledDimsAllConstant(*this)) - return Speculation::NotSpeculatable; - - return Speculation::Speculatable; -} - -// Return true if `inner_dims_pos` and `outer_dims_perm` target the same -// dimensions for pack and unpack. -static bool hasSameInnerOuterAttribute(PackOp packOp, UnPackOp unPackOp) { - if (packOp.getInnerDimsPos() != unPackOp.getInnerDimsPos()) - return false; - if (packOp.getOuterDimsPerm() == unPackOp.getOuterDimsPerm()) - return true; - // Outer dims permutation is optional. 
- // To compare unbalanced pack-unpack pair, treat no permutation as equal to - // identity permutation. - return isIdentityPermutation(packOp.getOuterDimsPerm()) && - isIdentityPermutation(unPackOp.getOuterDimsPerm()); -} - -// Return true if pack and unpack have the same tiles. -// Same SSA values or same integer constants. -static bool haveSameTiles(PackOp packOp, UnPackOp unPackOp) { - auto packTiles = packOp.getMixedTiles(); - auto unPackTiles = unPackOp.getMixedTiles(); - if (packTiles.size() != unPackTiles.size()) - return false; - for (size_t i = 0, e = packTiles.size(); i < e; i++) { - if (!isEqualConstantIntOrValue(packTiles[i], unPackTiles[i])) - return false; - } - return true; -} - -/// Returns true if the pack op does not need a padding value. -static bool paddingIsNotNeeded(PackOp op) { - auto srcType = op.getSourceType(); - if (llvm::any_of(op.getInnerDimsPos(), - [&](int64_t pos) { return srcType.isDynamicDim(pos); })) - return false; - if (ShapedType::isDynamicShape(op.getStaticInnerTiles())) - return false; - return !PackOp::requirePaddingValue( - srcType.getShape(), op.getInnerDimsPos(), op.getDestType().getShape(), - op.getOuterDimsPerm(), op.getMixedTiles()); -} - -/// Returns true if the `srcShape` or `destShape` is different from the one in -/// `packOp` and populates each with the inferred static shape. -static bool inferStaticShape(PackOp packOp, SmallVectorImpl &srcShape, - SmallVectorImpl &destShape) { - bool changeNeeded = false; - srcShape.assign(packOp.getSourceType().getShape().begin(), - packOp.getSourceType().getShape().end()); - destShape.assign(packOp.getDestType().getShape().begin(), - packOp.getDestType().getShape().end()); - llvm::SmallSetVector innerDims; - innerDims.insert(packOp.getInnerDimsPos().begin(), - packOp.getInnerDimsPos().end()); - SmallVector inverseOuterDimsPerm; - if (!packOp.getOuterDimsPerm().empty()) - inverseOuterDimsPerm = invertPermutationVector(packOp.getOuterDimsPerm()); - int srcRank = packOp.getSourceRank(); - for (auto i : llvm::seq(0, srcRank)) { - if (innerDims.contains(i)) - continue; - int64_t srcPos = i; - int64_t destPos = i; - if (!inverseOuterDimsPerm.empty()) - destPos = inverseOuterDimsPerm[srcPos]; - if (ShapedType::isDynamic(srcShape[srcPos]) == - ShapedType::isDynamic(destShape[destPos])) { - continue; - } - int64_t size = srcShape[srcPos]; - if (ShapedType::isDynamic(size)) - size = destShape[destPos]; - srcShape[srcPos] = size; - destShape[destPos] = size; - changeNeeded = true; - } - return changeNeeded; -} - -LogicalResult PackOp::canonicalize(PackOp packOp, PatternRewriter &rewriter) { - // Fold an pack(unpack(x)) to x. - if (auto unPackOp = packOp.getSource().getDefiningOp()) { - if (unPackOp.getSourceType() != packOp.getDestType()) - return failure(); - if (packOp.getPaddingValue() || - !hasSameInnerOuterAttribute(packOp, unPackOp) || - !haveSameTiles(packOp, unPackOp)) - return failure(); - rewriter.replaceOp(packOp, unPackOp.getSource()); - return success(); - } - - // Fold optional PaddingValue operand away if padding is not needed. - if (packOp.getPaddingValue() && paddingIsNotNeeded(packOp)) { - rewriter.startOpModification(packOp); - packOp.getPaddingValueMutable().clear(); - rewriter.finalizeOpModification(packOp); - return success(); - } - - // Insert tensor.cast ops if static shape inference is available.. 
- SmallVector srcShape, destShape; - if (inferStaticShape(packOp, srcShape, destShape)) { - Location loc = packOp.getLoc(); - Value source = packOp.getSource(); - if (srcShape != packOp.getSourceType().getShape()) { - auto newSrcType = packOp.getSourceType().clone(srcShape); - source = - rewriter.create(loc, newSrcType, packOp.getSource()); - } - Value dest = packOp.getDest(); - RankedTensorType originalResultType = packOp.getDestType(); - bool needUpdateDestType = (destShape != originalResultType.getShape()); - if (needUpdateDestType) { - auto newDestType = packOp.getDestType().clone(destShape); - dest = - rewriter.create(loc, newDestType, packOp.getDest()); - } - rewriter.modifyOpInPlace(packOp, [&] { - packOp.getSourceMutable().assign(source); - packOp.getDestMutable().assign(dest); - packOp.getResult().setType(cast(dest.getType())); - }); - // Insert a cast if needed - if (needUpdateDestType) { - rewriter.setInsertionPointAfter(packOp); - auto castOp = - rewriter.create(loc, originalResultType, packOp); - rewriter.replaceAllUsesExcept(packOp, castOp, castOp); - } - return success(); - } - - return failure(); -} - -template -static bool isLikePadUnPad(PackOrUnpackOp packOp, - RankedTensorType packedTensorType) { - static_assert(std::is_same::value || - std::is_same::value, - "Function meant for pack/unpack"); - // This is a pad if packing only adds ones and we don't transpose dimensions. - - // Check that we are not transposing any dimensions. - ArrayRef innerDimsPos = packOp.getInnerDimsPos(); - int64_t numPackedDims = innerDimsPos.size(); - auto orderedDims = llvm::to_vector<4>(llvm::seq(0, numPackedDims)); - if (orderedDims != innerDimsPos) { - // Dimensions don't happen in order. - return false; - } - - ArrayRef packedShape = packedTensorType.getShape(); - int64_t packedRank = packedTensorType.getRank(); - // At this point we know that we are taking numPackedDims outer - // dimensions and pushing them all the way as the inner most dimensions. - // What's left on the outer most dimensions is, in this order: - // - the factor of the packed dimensions, then - // - the untouched dimensions - // This shifting inward of dimensions is a no-op (as opposed to a transpose) - // if all the dimensions that bubble outerward are ones. - // Therefore check that all the dimensions but the numPackedDims inner most - // ones are ones. 
- return llvm::all_of( - llvm::seq(0, packedRank - numPackedDims), - [&packedShape](int64_t i) { return packedShape[i] == 1; }); -} - -bool PackOp::isLikePad() { - auto packedTensorType = - llvm::cast((*this)->getResultTypes().front()); - return isLikePadUnPad(*this, packedTensorType); -} - -OpFoldResult PackOp::fold(FoldAdaptor adaptor) { - std::optional paddingValue; - if (auto pad = adaptor.getPaddingValue()) - paddingValue = pad; - if (OpFoldResult reshapedSource = reshapeConstantSource( - llvm::dyn_cast_if_present(adaptor.getSource()), - getDestType(), paddingValue)) - return reshapedSource; - return {}; -} - -//===----------------------------------------------------------------------===// -// UnPackOp -//===----------------------------------------------------------------------===// - -void UnPackOp::getAsmResultNames( - function_ref setNameFn) { - setNameFn(getResult(), "unpack"); -} - -LogicalResult -UnPackOp::reifyResultShapes(OpBuilder &builder, - ReifiedRankedShapedTypeDims &reifiedReturnShapes) { - return reifyResultShapesImpl(*this, builder, reifiedReturnShapes); -} - -DenseMap UnPackOp::getDimAndTileMapping() { - return getDimAndTileMappingImpl(*this); -} - -SmallVector UnPackOp::getMixedTiles() { - return getMixedTilesImpl(*this); -} - -SmallVector UnPackOp::getStaticTiles() { - return getStaticTilesImpl(*this); -} - -ArrayRef UnPackOp::getAllOuterDims() { - ShapedType destType = getDestType(); - int64_t destRank = destType.getRank(); - return getSourceType().getShape().take_front(destRank); -} - -SmallVector UnPackOp::getTiledOuterDims() { - auto innerDimsPos = getInnerDimsPos(); - auto packedShape = getSourceType().getShape(); - SmallVector res; - - for (auto index : innerDimsPos) - res.push_back(packedShape[index]); - - return res; -} - -LogicalResult UnPackOp::verify() { - return commonVerifierPackAndUnPackOp(*this); -} - -Speculation::Speculatability UnPackOp::getSpeculatability() { - // See PackOp::getSpeculatability. - if (!areTilesAndTiledDimsAllConstant(*this)) - return Speculation::NotSpeculatable; - - return Speculation::Speculatable; -} - -void UnPackOp::build(OpBuilder &builder, OperationState &state, Value source, - Value dest, ArrayRef innerDimsPos, - ArrayRef innerTiles, - ArrayRef outerDimsPerm) { - assert(innerDimsPos.size() == innerTiles.size() && - "number of tile sizes specified must match the specified number of " - "original dimensions to be tiled"); - SmallVector staticTileSizes; - SmallVector dynamicTileSizes; - dispatchIndexOpFoldResults(innerTiles, dynamicTileSizes, staticTileSizes); - build(builder, state, dest.getType(), source, dest, - outerDimsPerm.empty() ? 
nullptr - : builder.getDenseI64ArrayAttr(outerDimsPerm), - builder.getDenseI64ArrayAttr(innerDimsPos), dynamicTileSizes, - builder.getDenseI64ArrayAttr(staticTileSizes)); -} - -Value UnPackOp::createDestinationTensor(OpBuilder &b, Location loc, - Value source, - ArrayRef innerTileSizes, - ArrayRef innerDimsPos, - ArrayRef outerDimsPerm) { - AffineExpr sym0, sym1; - bindSymbols(b.getContext(), sym0, sym1); - auto dimMul = [&](OpFoldResult v1, OpFoldResult v2) -> OpFoldResult { - return affine::makeComposedFoldedAffineApply(b, loc, sym0 * sym1, {v1, v2}); - }; - - SmallVector mixedSizes; - auto srcType = llvm::cast(source.getType()); - for (auto i : - llvm::seq(0, srcType.getRank() - innerTileSizes.size())) { - if (srcType.isDynamicDim(i)) - mixedSizes.push_back(b.create(loc, source, i).getResult()); - else - mixedSizes.push_back(b.getIndexAttr(srcType.getDimSize(i))); - } - if (!outerDimsPerm.empty()) { - applyPermutationToVector( - mixedSizes, invertPermutationVector(outerDimsPerm)); - } - - for (auto [dimPos, tileSize] : llvm::zip_equal(innerDimsPos, innerTileSizes)) - mixedSizes[dimPos] = dimMul(mixedSizes[dimPos], tileSize); - - auto elemType = srcType.getElementType(); - return b.create(loc, mixedSizes, elemType); -} - -UnPackOp UnPackOp::createTransposedClone(OpBuilder &b, Location loc, - Value transposedSource, - ArrayRef innerPermutation, - ArrayRef outerPermutation) { - PackOrUnPackTransposeResult metadata = commonPermutationOfPackAndUnPackOp( - *this, innerPermutation, outerPermutation); - return b.create(loc, transposedSource, getDest(), - metadata.innerDimsPos, metadata.innerTiles, - metadata.outerDimsPerm); -} - -/// Returns true if the `srcShape` or `destShape` is different from the one in -/// `op` and populates each with the inferred static shape. 
-static bool inferStaticShape(UnPackOp op, SmallVectorImpl &srcShape, - SmallVectorImpl &destShape) { - bool changeNeeded = false; - srcShape.assign(op.getSourceType().getShape().begin(), - op.getSourceType().getShape().end()); - destShape.assign(op.getDestType().getShape().begin(), - op.getDestType().getShape().end()); - llvm::SmallSetVector innerDims; - innerDims.insert(op.getInnerDimsPos().begin(), op.getInnerDimsPos().end()); - SmallVector inverseOuterDimsPerm; - if (!op.getOuterDimsPerm().empty()) - inverseOuterDimsPerm = invertPermutationVector(op.getOuterDimsPerm()); - int destRank = op.getDestRank(); - for (auto i : llvm::seq(0, destRank)) { - if (innerDims.contains(i)) - continue; - int64_t srcPos = i; - int64_t destPos = i; - if (!inverseOuterDimsPerm.empty()) - srcPos = inverseOuterDimsPerm[destPos]; - if (ShapedType::isDynamic(srcShape[srcPos]) == - ShapedType::isDynamic(destShape[destPos])) { - continue; - } - int64_t size = srcShape[srcPos]; - if (ShapedType::isDynamic(size)) - size = destShape[destPos]; - srcShape[srcPos] = size; - destShape[destPos] = size; - changeNeeded = true; - } - return changeNeeded; -} - -LogicalResult UnPackOp::canonicalize(UnPackOp unPackOp, - PatternRewriter &rewriter) { - /// unpack(pack(x)) -> x - if (PackOp packOp = unPackOp.getSource().getDefiningOp()) { - if (packOp.getSourceType() != unPackOp.getDestType()) - return failure(); - if (packOp.getPaddingValue() || - !hasSameInnerOuterAttribute(packOp, unPackOp) || - !haveSameTiles(packOp, unPackOp)) - return failure(); - rewriter.replaceOp(unPackOp, packOp.getSource()); - return success(); - } - /// unpack(destinationStyleOp(x)) -> unpack(x) - if (auto dstStyleOp = - unPackOp.getDest().getDefiningOp()) { - auto destValue = cast(unPackOp.getDest()); - Value newDest = dstStyleOp.getDpsInits()[destValue.getResultNumber()]; - rewriter.modifyOpInPlace(unPackOp, - [&]() { unPackOp.setDpsInitOperand(0, newDest); }); - return success(); - } - - // Insert tensor.cast ops if static shape inference is available.. - SmallVector srcShape, destShape; - if (inferStaticShape(unPackOp, srcShape, destShape)) { - Location loc = unPackOp.getLoc(); - Value source = unPackOp.getSource(); - if (srcShape != unPackOp.getSourceType().getShape()) { - auto newSrcType = unPackOp.getSourceType().clone(srcShape); - source = rewriter.create(loc, newSrcType, - unPackOp.getSource()); - } - Value dest = unPackOp.getDest(); - if (destShape != unPackOp.getDestType().getShape()) { - auto newDestType = unPackOp.getDestType().clone(destShape); - dest = - rewriter.create(loc, newDestType, unPackOp.getDest()); - } - Value newOp = rewriter.create( - loc, source, dest, unPackOp.getInnerDimsPos(), unPackOp.getMixedTiles(), - unPackOp.getOuterDimsPerm()); - rewriter.replaceOpWithNewOp( - unPackOp, unPackOp.getResult().getType(), newOp); - return success(); - } - - return failure(); -} - -bool UnPackOp::isLikeUnPad() { - RankedTensorType packedTensorType = getSourceType(); - return isLikePadUnPad(*this, packedTensorType); -} - -OpFoldResult UnPackOp::fold(FoldAdaptor adaptor) { - if (OpFoldResult reshapedSource = reshapeConstantSource( - llvm::dyn_cast_if_present(adaptor.getSource()), - getResult().getType())) - return reshapedSource; - return {}; -} - //===----------------------------------------------------------------------===// // Common Canonicalizers and Folders. 
//===----------------------------------------------------------------------===// @@ -4809,151 +3887,6 @@ bool foldTensorCastPrecondition(DestinationStyleOpInterface op) { return hasFoldableTensorCastOperand(op); } -// Given the (potentially) updated packed type, `newPackedTy`, generates an -// updated mixed-tile-sizes attribute. A tile size is updated only -// when: -// * a dim from newPackedTy is static, and -// * the corresponding size from mixedTiles is still dynamic. -// Otherwise, the original tile size is preserved. -// Note - packed-type-dim and mixed-tile-size should always match! -static SmallVector -getNewMixedTileSizes(PatternRewriter &rewriter, Type newPackedTy, - SmallVector mixedTiles) { - SmallVector newMixedTileSizes; - for (auto it : llvm::zip(cast(newPackedTy) - .getShape() - .take_back(mixedTiles.size()), - mixedTiles)) { - int64_t shape = std::get<0>(it); - if (shape == ShapedType::kDynamic) { - newMixedTileSizes.push_back(std::get<1>(it)); - continue; - } - - // If the current result dim is static, update the dynamic mixed-size - // (provided the original value is dynamic). - OpFoldResult tile = std::get<1>(it); - if (Attribute attr = llvm::dyn_cast_if_present(tile)) { - // Already a constant - newMixedTileSizes.push_back(tile); - } else { - assert(getConstantIntValue(tile).value() == shape && - "tile size and dim size don't match!"); - newMixedTileSizes.push_back( - (rewriter.getIntegerAttr(rewriter.getIndexType(), shape))); - } - } - - return newMixedTileSizes; -} - -/// Folds a tensor.cast op into a consuming tensor::PackOp op if the -/// `tensor.cast` has source that is more static than the consuming op. -/// -/// Example: -/// ```mlir -/// %1 = tensor.cast %0 : tensor<8x16xf32> to tensor -/// %2 = tensor.pack %1 ... : tensor ... -/// ``` -/// -/// folds into: -/// -/// ```mlir -/// %2 = tensor.pack %0 ... : tensor<8x16xf32> ... -/// ``` -struct FoldTensorCastPackOp : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(PackOp op, - PatternRewriter &rewriter) const override { - if (!foldTensorCastPrecondition(op)) - return failure(); - - SmallVector newResultTypes(op->getResultTypes()); - SmallVector newOperands = - getUpdatedOperandsAfterCastOpFolding(op, newResultTypes); - - // Get the updated mixed-tile-sizes attribute. - SmallVector newMixedTileSizes = - getNewMixedTileSizes(rewriter, newResultTypes[0], op.getMixedTiles()); - - // Clone op. - // TODO: Strictly speaking, discardable attributes should be _discarded_ at - // this point. However, in practice, we use them for things that we'd like - // to preserve. Implement a better abstraction. - PackOp newOp = rewriter.create( - op.getLoc(), newOperands[0], newOperands[1], op.getInnerDimsPos(), - newMixedTileSizes, op.getPaddingValue(), op.getOuterDimsPerm()); - newOp->setDiscardableAttrs(op->getDiscardableAttrDictionary()); - - // Replace op. - Value oldResult = op.getResult(); - Value newResult = newOp.getResult(); - Value replacement = (newResult.getType() != oldResult.getType()) - ? rewriter.create( - op->getLoc(), oldResult.getType(), newResult) - : newResult; - - rewriter.replaceOp(op, {replacement}); - - return success(); - } -}; - -/// Folds a tensor.cast op into a consuming tensor::UnPackOp op if the -/// `tensor.cast` has source that is more static than the consuming op. -/// -/// Example: -/// ```mlir -/// %1 = tensor.cast %0 : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32> -/// %2 = tensor.unpack %1 ... 
: tensor<1x1x?x1xi32> -> tensor<7x?xi32> -/// ``` -/// -/// folds into: -/// -/// ```mlir -/// %2 = tensor.unpack %0 ... tensor<1x1x8x1xi32> -> tensor<7x?xi32> -/// ``` -struct FoldTensorCastUnPackOp : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(UnPackOp op, - PatternRewriter &rewriter) const override { - if (!foldTensorCastPrecondition(op)) - return failure(); - - SmallVector newResultTypes(op->getResultTypes()); - SmallVector newOperands = - getUpdatedOperandsAfterCastOpFolding(op, newResultTypes); - Value sourceTensor = newOperands[0]; - - // Get the updated mixed-tile-sizes attribute. - SmallVector newMixedTileSizes = getNewMixedTileSizes( - rewriter, sourceTensor.getType(), op.getMixedTiles()); - - // Clone op. - // TODO: Strictly speaking, discardable attributes should be _discarded_ at - // this point. However, in practice, we use them for things that we'd like - // to preserve. Implement a better abstraction. - UnPackOp newOp = rewriter.create( - op.getLoc(), sourceTensor, newOperands[1], op.getInnerDimsPos(), - newMixedTileSizes, op.getOuterDimsPerm()); - newOp->setDiscardableAttrs(op->getDiscardableAttrDictionary()); - - // Replace op. - Value oldResult = op.getResult(); - Value newResult = newOp.getResult(); - Value replacement = (newResult.getType() != oldResult.getType()) - ? rewriter.create( - op->getLoc(), oldResult.getType(), newResult) - : newResult; - - rewriter.replaceOp(op, {replacement}); - - return success(); - } -}; - /// Folds a tensor.cast op into a consuming DestinationStyleOpInterface op if /// the `tensor.cast` has source that is more static than the consuming op. /// @@ -4978,9 +3911,10 @@ struct FoldTensorCastProducerOp LogicalResult matchAndRewrite(DestinationStyleOpInterface op, PatternRewriter &rewriter) const override { - // Reject tensor::PackOp - there's dedicated pattern for that instead. + // Reject PackOp/UnpackOp (i.e. RelayoutOps) - there are dedicated patterns + // for that instead. if (!foldTensorCastPrecondition(op) || - isa(*op)) + isa(*op)) return failure(); SmallVector newResultTypes(op->getResultTypes()); @@ -5013,8 +3947,6 @@ struct FoldTensorCastProducerOp void TensorDialect::getCanonicalizationPatterns( RewritePatternSet &results) const { - results.add(getContext()); - results.add(getContext()); results.add(getContext()); } diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp index 052dee402b79e..138e4be6b18e9 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp @@ -87,648 +87,6 @@ struct PadOpTiling : public TilingInterface::ExternalModel { } }; -template -static SmallVector getPackUnPackIterationDomain(OpTy op, - OpBuilder &builder) { - static_assert(llvm::is_one_of::value, - "applies to only pack or unpack operations"); - OpBuilder::InsertionGuard g(builder); - int64_t rank = (std::is_same::value) ? 
op.getSourceRank() - : op.getDestRank(); - OpFoldResult zero = builder.getIndexAttr(0); - OpFoldResult one = builder.getIndexAttr(1); - ReifiedRankedShapedTypeDims resultShape; - (void)reifyResultShapes(builder, op, resultShape); - SmallVector loopBounds(rank); - for (auto dim : llvm::seq(0, rank)) { - loopBounds[dim].offset = zero; - loopBounds[dim].stride = one; - loopBounds[dim].size = resultShape[0][dim]; - } - return loopBounds; -} - -static void applyPermToRange(SmallVector &offsets, - SmallVector &sizes, - ArrayRef permutation) { - if (permutation.empty()) - return; - applyPermutationToVector(offsets, permutation); - applyPermutationToVector(sizes, permutation); -} - -struct PackOpTiling - : public TilingInterface::ExternalModel { - - SmallVector getLoopIteratorTypes(Operation *op) const { - // Note that here we only consider untiled dimensions and outer tiled data - // dimensions, the inner tiled data dimensions are materialized when - // building the body of the operation. - auto packOp = cast(op); - SmallVector iteratorTypes( - packOp.getSourceRank(), utils::IteratorType::parallel); - return iteratorTypes; - } - - SmallVector getIterationDomain(Operation *op, OpBuilder &b) const { - return getPackUnPackIterationDomain(cast(op), b); - } - - FailureOr - getTiledImplementation(Operation *op, OpBuilder &b, - ArrayRef offsets, - ArrayRef sizes) const { - auto packOp = cast(op); - Location loc = packOp.getLoc(); - - // The tiling is applied on interchanged dimensions. We have to undo the - // interchange to map sizes and offsets to the original input. - int64_t inputRank = packOp.getSourceRank(); - SmallVector origOffsets(offsets); - SmallVector origSizes(sizes); - applyPermToRange(origOffsets, origSizes, - invertPermutationVector(packOp.getOuterDimsPerm())); - - DenseMap dimAndTileMapping = - packOp.getDimAndTileMapping(); - SmallVector srcDimValues = - tensor::getMixedSizes(b, loc, packOp.getSource()); - SmallVector inputIndices, inputSizes; - for (auto dim : llvm::seq(0, inputRank)) { - using AV = affine::AffineValueExpr; - affine::AffineBuilder ab(b, loc); - AffineExpr dim0, dim1, sym; - bindDims(b.getContext(), dim0, dim1); - bindSymbols(b.getContext(), sym); - if (dimAndTileMapping.count(dim)) { - // If the data dimension is tiled, the i-th index is the product of - // offset_i and tile_i, and the i-th size is the product of sizes_i and - // tile_i. - auto avOffset = AV(dim0).bind(origOffsets[dim]); - auto avSize = AV(dim0).bind(origSizes[dim]); - auto avTileSize = AV(sym).bind(dimAndTileMapping[dim]); - inputIndices.push_back(ab.mul(avOffset, avTileSize)); - inputSizes.push_back(ab.mul(avSize, avTileSize)); - } else { - inputIndices.push_back(origOffsets[dim]); - inputSizes.push_back(origSizes[dim]); - } - - // Limit the size of the input operand for incomplete tiles. 
- if (packOp.getPaddingValue()) { - OpFoldResult dimSize = srcDimValues[dim]; - auto avDimSize = AV(dim0).bind(dimSize); - auto avInputIdx = AV(dim1).bind(inputIndices.back()); - inputSizes.back() = - ab.min({inputSizes.back(), ab.sub(avDimSize, avInputIdx)}); - } - } - - auto oneAttr = b.getI64IntegerAttr(1); - SmallVector strides(inputRank, oneAttr); - - SmallVector tiledOperands; - auto sourceSlice = b.create( - loc, packOp.getSource(), inputIndices, inputSizes, strides); - tiledOperands.push_back(sourceSlice); - - SmallVector outputOffsets, outputSizes; - if (failed(getResultTilePosition(op, b, 0, offsets, sizes, outputOffsets, - outputSizes))) - return {}; - - strides.append(packOp.getDestRank() - inputRank, oneAttr); - auto outSlice = b.create( - loc, packOp.getDest(), outputOffsets, outputSizes, strides); - tiledOperands.push_back(outSlice); - - if (auto val = packOp.getPaddingValue()) - tiledOperands.push_back(val); - for (auto tile : packOp.getInnerTiles()) - tiledOperands.push_back(tile); - - Operation *tiledPackOp = b.create( - loc, TypeRange{outSlice.getType()}, tiledOperands, op->getAttrs()); - - return TilingResult{ - {tiledPackOp}, - SmallVector(tiledPackOp->getResults()), - llvm::to_vector(ArrayRef{sourceSlice, outSlice})}; - } - - LogicalResult - getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber, - ArrayRef offsets, - ArrayRef sizes, - SmallVector &resultOffsets, - SmallVector &resultSizes) const { - // The iteration domain is over outer dimensions of packed layout. In this - // context, the outer dimensions of `resultOffsets` are `offsets`. The - // inner dimensions of `resultOffsets` are zeros because tiling is not - // applied to them. - auto packOp = cast(op); - int64_t inputRank = packOp.getSourceRank(); - int64_t outputRank = packOp.getDestRank(); - auto zeroAttr = b.getI64IntegerAttr(0); - resultOffsets.assign(offsets.begin(), offsets.end()); - resultOffsets.append(outputRank - inputRank, zeroAttr); - - ReifiedRankedShapedTypeDims outputShape; - (void)reifyResultShapes(b, packOp, outputShape); - resultSizes.assign(sizes.begin(), sizes.end()); - for (auto dataTileDim : llvm::seq(inputRank, outputRank)) - resultSizes.push_back(outputShape[0][dataTileDim]); - - return success(); - } - - FailureOr - generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber, - ArrayRef offsets, - ArrayRef sizes) const { - auto packOp = cast(op); - int64_t numTiles = packOp.getInnerDimsPos().size(); - - // tensor.pack op is fusible (as a producer) only if full inner tiles are - // iterated or inner dims are not tiled. Otherwise, it will generate a - // sequence of non-trivial ops (for partial tiles). - for (auto offset : offsets.take_back(numTiles)) - if (!isConstantIntValue(offset, 0)) - return failure(); - - for (auto iter : - llvm::zip_equal(packOp.getMixedTiles(), sizes.take_back(numTiles))) - if (!isEqualConstantIntOrValue(std::get<0>(iter), std::get<1>(iter))) - return failure(); - - FailureOr tilingResult = getTiledImplementation( - op, b, offsets.drop_back(numTiles), sizes.drop_back(numTiles)); - if (failed(tilingResult)) - return failure(); - return tilingResult.value(); - } - - /// Method to return the position of iteration domain tile computed by the - /// tiled operation. In current `tensor.pack` context, the `resultOffsets` and - /// `resultSizes` only cover outer dimensions. 
- LogicalResult getIterationDomainTileFromOperandTile( - Operation *op, OpBuilder &b, unsigned operandNumber, - ArrayRef offsets, ArrayRef sizes, - SmallVectorImpl &resultOffsets, - SmallVectorImpl &resultSizes) const { - if (operandNumber != 0) - return failure(); - - auto packOp = cast(op); - // It is not trivial to infer dest tile from source tile if `packOp` has - // padding semantic. - if (packOp.getPaddingValue()) - return failure(); - - Location loc = packOp.getLoc(); - - SmallVector outerDimOffsets, outerDimSizes; - DenseMap dimAndTileMapping = - packOp.getDimAndTileMapping(); - for (auto dim : llvm::seq(packOp.getSourceRank())) { - if (dimAndTileMapping.count(dim)) { - FailureOr cstSize = - ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::UB, sizes[dim], - /*stopCondition=*/nullptr, /*closedUB=*/true); - std::optional cstInnerSize = - getConstantIntValue(dimAndTileMapping[dim]); - // Currently fusing `packOp` as consumer only expects perfect tiling - // scenario because even if without padding semantic, the `packOp` may - // also yield incomplete tiles. E.g. tensor<30xf32> -> tensor<5x6xf32>, - // where the `tileSize` from operand of `packOp` is 5, which is not - // exactly divided by `innerTile`(=6) of `packOp`. As the result: - // 1. the first slice is extracted from (0) to (4) and inserted into - // (0,0)~(0,4) at first row. - // 2. the second slice is extracted from (5) to (9) and SHOULD BE - // respectively inserted into two rows with different length, including - // first row: (0,5) and second row (1,0)~(1,3). It is hard to coordinate - // them, thus adding below constraint to bypass them temporarily. In - // another word, we can only support tiling with consumer if the tile - // size for the producer is a multiple of the inner tile size for the - // packed dimensions at this moment. - if (failed(cstSize) || !cstInnerSize || *cstSize % *cstInnerSize != 0) { - return failure(); - } - - using AV = affine::AffineValueExpr; - affine::AffineBuilder ab(b, loc); - AffineExpr dim0, sym; - bindDims(b.getContext(), dim0); - bindSymbols(b.getContext(), sym); - auto avOffset = AV(dim0).bind(offsets[dim]); - auto avSize = AV(dim0).bind(sizes[dim]); - auto avTileSize = AV(sym).bind(dimAndTileMapping[dim]); - outerDimOffsets.push_back(ab.floor(avOffset, avTileSize)); - outerDimSizes.push_back(ab.ceil(avSize, avTileSize)); - } else { - outerDimOffsets.push_back(offsets[dim]); - outerDimSizes.push_back(sizes[dim]); - } - } - applyPermToRange(outerDimOffsets, outerDimSizes, packOp.getOuterDimsPerm()); - resultOffsets = outerDimOffsets; - resultSizes = outerDimSizes; - return success(); - } - - /// Method to return the tiled implementation of tensor.pack as a consumer. 
- FailureOr getTiledImplementationFromOperandTile( - Operation *op, OpBuilder &b, unsigned operandNumber, - ArrayRef offsets, ArrayRef sizes) const { - if (operandNumber != 0) - return failure(); - - auto packOp = cast(op); - Location loc = packOp.getLoc(); - - int64_t inputRank = packOp.getSourceRank(); - auto oneAttr = b.getI64IntegerAttr(1); - SmallVector strides(inputRank, oneAttr); - - SmallVector tiledOperands; - auto sourceSlice = b.create(loc, packOp.getSource(), - offsets, sizes, strides); - tiledOperands.push_back(sourceSlice); - - SmallVector outerDimOffsets, outerDimSizes; - if (failed(getIterationDomainTileFromOperandTile( - op, b, /*operandNumber=*/0, offsets, sizes, outerDimOffsets, - outerDimSizes))) - return failure(); - - SmallVector outputOffsets, outputSizes; - if (failed(getResultTilePosition(op, b, 0, outerDimOffsets, outerDimSizes, - outputOffsets, outputSizes))) - return failure(); - - strides.append(packOp.getDestRank() - inputRank, oneAttr); - auto outSlice = b.create( - loc, packOp.getDest(), outputOffsets, outputSizes, strides); - tiledOperands.push_back(outSlice); - - assert(!packOp.getPaddingValue() && "Expect no padding semantic"); - for (auto tile : packOp.getInnerTiles()) - tiledOperands.push_back(tile); - - Operation *tiledPackOp = b.create( - loc, TypeRange{outSlice.getType()}, tiledOperands, op->getAttrs()); - - return TilingResult{ - {tiledPackOp}, - SmallVector(tiledPackOp->getResults()), - llvm::to_vector(ArrayRef{sourceSlice, outSlice})}; - } -}; - -struct UnpackTileDimInfo { - bool isAlignedToInnerTileSize; - OpFoldResult sourceOffset; - OpFoldResult sourceSize; - OpFoldResult resultOffset; - OpFoldResult destExpandedSize; -}; - -/// Returns the needed information for tiling unpack op on `tileDim` with given -/// `tileOffset` and `tileSize`. For more details, see the comment of the -/// `getTiledImplementation`. -static UnpackTileDimInfo getUnpackTileDimInfo(OpBuilder &b, UnPackOp unpackOp, - int64_t tileDim, - OpFoldResult tileOffset, - OpFoldResult tileSize) { - UnpackTileDimInfo info; - Attribute zeroAttr = b.getIndexAttr(0); - Attribute oneAttr = b.getIndexAttr(1); - DenseMap dimAndTileMapping = - unpackOp.getDimAndTileMapping(); - // The dimension is not one of packed data dimension. - if (!dimAndTileMapping.count(tileDim)) { - info.isAlignedToInnerTileSize = true; - info.sourceOffset = tileOffset; - info.sourceSize = tileSize; - info.resultOffset = zeroAttr; - info.destExpandedSize = tileSize; - return info; - } - - Location loc = unpackOp.getLoc(); - using AV = affine::AffineValueExpr; - affine::AffineBuilder ab(b, loc); - AffineExpr dim0, dim1, sym0; - bindDims(b.getContext(), dim0, dim1); - bindSymbols(b.getContext(), sym0); - - OpFoldResult innerTileSize = dimAndTileMapping[tileDim]; - - info.isAlignedToInnerTileSize = false; - FailureOr cstSize = ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::UB, tileSize, - /*stopCondition=*/nullptr, /*closedUB=*/true); - std::optional cstInnerSize = getConstantIntValue(innerTileSize); - if (!failed(cstSize) && cstInnerSize) { - if (*cstSize % *cstInnerSize == 0) - info.isAlignedToInnerTileSize = true; - - // If the tiling size equals to the inner tiling size, the outer dims are - // always 1. 
- if (*cstInnerSize == *cstSize) { - auto lhs = AV(dim0).bind(tileOffset); - auto rhs = AV(dim1).bind(innerTileSize); - info.sourceOffset = ab.floor(lhs, rhs); - info.sourceSize = oneAttr; - info.resultOffset = zeroAttr; - info.destExpandedSize = tileSize; - return info; - } - } - - if (info.isAlignedToInnerTileSize) { - info.sourceOffset = - ab.floor(AV(dim0).bind(tileOffset), AV(dim1).bind(innerTileSize)); - info.resultOffset = zeroAttr; - info.destExpandedSize = tileSize; - - // The ceilDiv is needed here because there could be incomplete tile even - // it is perfect tiling cases. E.g., - // %0 = unpack tensor<33x2xf32> into tensor<64xf32> - // If the tiling size is 32, there will be 3 tiles. Two of them have - // size=32; one of them have size=2. The size is represented using - // affine_min op; we need ceilDiv. - info.sourceSize = - ab.ceil(AV(dim0).bind(tileSize), AV(dim1).bind(innerTileSize)); - return info; - } - - affine::DivModValue firstCoord = affine::getDivMod( - b, loc, getValueOrCreateConstantIndexOp(b, loc, tileOffset), - getValueOrCreateConstantIndexOp(b, loc, innerTileSize)); - OpFoldResult tileExclusiveBound = - ab.add(AV(dim0).bind(tileOffset), AV(dim1).bind(tileSize)); - affine::DivModValue lastCoord = affine::getDivMod( - b, loc, - getValueOrCreateConstantIndexOp( - b, loc, - ab.sub(AV(dim0).bind(tileExclusiveBound), AV(dim1).bind(oneAttr))), - getValueOrCreateConstantIndexOp(b, loc, innerTileSize)); - - OpFoldResult lengthMinusOne = ab.sub(AV(dim0).bind(lastCoord.quotient), - AV(dim1).bind(firstCoord.quotient)); - info.sourceSize = - ab.add(AV(dim0).bind(lengthMinusOne), AV(dim1).bind(oneAttr)); - info.sourceOffset = firstCoord.quotient; - info.resultOffset = firstCoord.remainder; - // Do not create an Affine ops for expanded size because the affine op is too - // complicated which would trigger an issue in affine ops simplification. - info.destExpandedSize = b.createOrFold( - loc, getValueOrCreateConstantIndexOp(b, loc, info.sourceSize), - getValueOrCreateConstantIndexOp(b, loc, innerTileSize)); - return info; -} - -struct UnPackOpTiling - : public TilingInterface::ExternalModel { - - SmallVector getLoopIteratorTypes(Operation *op) const { - auto unpackOp = cast(op); - SmallVector iteratorTypes( - unpackOp.getDestRank(), utils::IteratorType::parallel); - return iteratorTypes; - } - - SmallVector getIterationDomain(Operation *op, OpBuilder &b) const { - return getPackUnPackIterationDomain(cast(op), b); - } - - /// There are two cases in tiling unpack ops. If the tiling size is aligned to - /// the inner tile size, the corresponding tiles of source are all complete. - /// Otherwise, there are in-complete tiles. We will need to expand the slice - /// of source for getting complete tiles. The tiled unpack op unpacks more - /// data from source, so We'll need an extract_slice op to shift and truncate - /// the output. - /// Take Nn_to_N as an example. Say that N=32, n=8, and tiling_size=15. The - /// coordinates of second tile (i.e., result[15..31]) are - /// [(1, 7), (2, 0,), (2, 1) ... (3, 6), (3, 7)]. The first row and the last - /// row are incomplete tiles. To represent the unpack op, we have to complete - /// the rows. I.e., the input coordinates would start with (1, 0); end with - /// (3, 7). In this context, the tiled unpack produces a (3 * n) elements - /// because there are 3 rows in total. Follow by a tensor.extract_slice op, we - /// can get the actual result. 
- FailureOr - getTiledImplementation(Operation *op, OpBuilder &b, - ArrayRef offsets, - ArrayRef sizes) const { - auto unpackOp = cast(op); - int64_t srcRank = unpackOp.getSourceRank(); - int64_t destRank = unpackOp.getDestRank(); - int64_t numInnerTiles = srcRank - destRank; - Location loc = unpackOp.getLoc(); - - // The perfect tiling case indicates that the tiling sizes are multiple of - // inner_tile_size. In this context, no extra data is needed when - // representing the tiled unpack op. - bool isPerfectTilingCase = true; - Attribute oneAttr = b.getIndexAttr(1); - SmallVector sliceSrcStrides(destRank, oneAttr); - SmallVector sliceSrcIndices, sliceSrcSizes; - SmallVector destExpandedSizes, resultOffsetsFromDest; - for (auto dim : llvm::seq(0, destRank)) { - UnpackTileDimInfo info = - getUnpackTileDimInfo(b, unpackOp, dim, offsets[dim], sizes[dim]); - if (!info.isAlignedToInnerTileSize) - isPerfectTilingCase = false; - sliceSrcIndices.push_back(info.sourceOffset); - sliceSrcSizes.push_back(info.sourceSize); - destExpandedSizes.push_back(info.destExpandedSize); - resultOffsetsFromDest.push_back(info.resultOffset); - } - - // The tiling is applied on destination dimensions. We have to apply the - // interchange on source dimensions if outer_dims_perm is set. - applyPermToRange(sliceSrcIndices, sliceSrcSizes, - unpackOp.getOuterDimsPerm()); - Attribute zeroAttr = b.getIndexAttr(0); - sliceSrcIndices.append(numInnerTiles, zeroAttr); - sliceSrcSizes.append(unpackOp.getMixedTiles()); - sliceSrcStrides.append(numInnerTiles, oneAttr); - SmallVector generatedSlices; - ExtractSliceOp sliceSource = - b.create(loc, unpackOp.getSource(), sliceSrcIndices, - sliceSrcSizes, sliceSrcStrides); - generatedSlices.push_back(sliceSource); - - SmallVector destStrides(destRank, oneAttr); - Value sliceDest; - if (isPerfectTilingCase) { - auto destSliceOp = b.create(loc, unpackOp.getDest(), - offsets, sizes, destStrides); - sliceDest = destSliceOp; - generatedSlices.push_back(destSliceOp); - } else { - sliceDest = b.create(loc, destExpandedSizes, - unpackOp.getDestType().getElementType()); - } - - SmallVector tiledOperands = {sliceSource.getResult(), sliceDest}; - for (auto tile : unpackOp.getInnerTiles()) - tiledOperands.push_back(tile); - - Operation *tiledUnpackOp = b.create( - loc, TypeRange{sliceDest.getType()}, tiledOperands, op->getAttrs()); - - if (isPerfectTilingCase) - return TilingResult{{tiledUnpackOp}, - SmallVector(tiledUnpackOp->getResults()), - generatedSlices}; - - auto extractSlice = - b.create(loc, tiledUnpackOp->getResult(0), - resultOffsetsFromDest, sizes, destStrides); - return TilingResult{ - {tiledUnpackOp}, {extractSlice.getResult()}, generatedSlices}; - } - - LogicalResult - getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber, - ArrayRef offsets, - ArrayRef sizes, - SmallVector &resultOffsets, - SmallVector &resultSizes) const { - resultOffsets = llvm::to_vector(offsets); - resultSizes = llvm::to_vector(sizes); - return success(); - } - - FailureOr - generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber, - ArrayRef offsets, - ArrayRef sizes) const { - FailureOr tilingResult = - getTiledImplementation(op, b, offsets, sizes); - if (failed(tilingResult)) - return failure(); - return tilingResult.value(); - } - - /// Method to return the position of iteration domain tile computed by the - /// tiled operation. 
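The per-dimension mapping performed by the method below (scale the packed offset and size by the inner tile, then clamp the size to the unpacked extent) can be sanity-checked in isolation. A hypothetical, self-contained sketch with made-up numbers rather than the op's real operands:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    int main() {
      int64_t innerTile = 16, packedOffset = 2, packedSize = 3, unpackedDim = 40;
      // Iteration-domain offset is the packed offset times the inner tile size.
      int64_t resultOffset = packedOffset * innerTile; // 32
      // Iteration-domain size is clamped so the tile does not run past the result.
      int64_t resultSize =
          std::min(packedSize * innerTile, unpackedDim - resultOffset); // min(48, 8) == 8
      assert(resultOffset == 32 && resultSize == 8);
      return 0;
    }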
- LogicalResult getIterationDomainTileFromOperandTile( - Operation *op, OpBuilder &b, unsigned operandNumber, - ArrayRef offsets, ArrayRef sizes, - SmallVectorImpl &resultOffsets, - SmallVectorImpl &resultSizes) const { - auto unPackOp = cast(op); - // If the operand tile is the dest, then no adjustment is needed. - if (operandNumber == unPackOp.getDestMutable().getOperandNumber()) { - resultOffsets = llvm::to_vector(offsets); - resultSizes = llvm::to_vector(sizes); - return success(); - } - Location loc = unPackOp.getLoc(); - - int64_t numTiles = unPackOp.getInnerDimsPos().size(); - auto destOffsets = offsets.drop_back(numTiles); - auto destSizes = sizes.drop_back(numTiles); - // The tiling is applied on interchanged dimensions. We have to undo the - // interchange to map sizes and offsets to the original input. - int64_t outputRank = unPackOp.getDestRank(); - ReifiedRankedShapedTypeDims reifiedReturnShapes; - if (failed(reifyResultShapes(b, unPackOp, reifiedReturnShapes))) - return failure(); - SmallVector outputMixedSizes = reifiedReturnShapes.front(); - SmallVector origOffsets(destOffsets); - SmallVector origSizes(destSizes); - applyPermToRange(origOffsets, origSizes, - invertPermutationVector(unPackOp.getOuterDimsPerm())); - - DenseMap dimAndTileMapping = - unPackOp.getDimAndTileMapping(); - - for (auto dim : llvm::seq(0, outputRank)) { - using AV = affine::AffineValueExpr; - affine::AffineBuilder ab(b, loc); - AffineExpr dim0, dim1, sym0; - bindDims(b.getContext(), dim0, dim1); - bindSymbols(b.getContext(), sym0); - if (dimAndTileMapping.count(dim)) { - // If the data dimension is tiled, the i-th index is the product of - // offset_i and tile_i, and the i-th size is the product of sizes_i and - // tile_i. The sizes must be clamped to the sizes of the unpack result. - auto avOffset = AV(dim0).bind(origOffsets[dim]); - auto avSize = AV(dim0).bind(origSizes[dim]); - auto avTileSize = AV(sym0).bind(dimAndTileMapping[dim]); - auto avResultSize = AV(dim0).bind(outputMixedSizes[dim]); - resultOffsets.push_back(ab.mul(avOffset, avTileSize)); - auto avResultOffset = AV(dim1).bind(resultOffsets.back()); - resultSizes.push_back(ab.min({ab.mul(avSize, avTileSize), - ab.sub(avResultSize, avResultOffset)})); - } else { - resultOffsets.push_back(origOffsets[dim]); - resultSizes.push_back(origSizes[dim]); - } - } - return success(); - } - - /// Method to return the tiled implementation of tensor.unpack as a consumer. - FailureOr getTiledImplementationFromOperandTile( - Operation *op, OpBuilder &b, unsigned operandNumber, - ArrayRef offsets, ArrayRef sizes) const { - auto unPackOp = cast(op); - // tensor.unpack op is fusible (as a consumer) only if inner dims are not - // tiled. - int64_t numTiles = unPackOp.getInnerDimsPos().size(); - for (auto iter : - llvm::zip_equal(unPackOp.getMixedTiles(), sizes.take_back(numTiles))) { - if (!isEqualConstantIntOrValue(std::get<0>(iter), std::get<1>(iter))) - return failure(); - } - - Location loc = unPackOp.getLoc(); - - // Fetch offset/size for creating the slice of the dest operand of - // unpack op. - SmallVector outputOffsets, outputSizes; - if (failed(getIterationDomainTileFromOperandTile( - op, b, /*operandNumber=*/0, offsets, sizes, outputOffsets, - outputSizes))) - return failure(); - - auto oneAttr = b.getI64IntegerAttr(1); - int64_t outputRank = unPackOp.getDestRank(); - SmallVector strides(outputRank, oneAttr); - - SmallVector tiledOperands; - // Create slice of the dest operand. 
- auto extractDestSlice = b.create( - loc, unPackOp.getDest(), outputOffsets, outputSizes, strides); - tiledOperands.push_back(extractDestSlice); - - SmallVector inputOffsets, inputSizes; - strides.append(unPackOp.getSourceRank() - outputRank, oneAttr); - // Create slice of the source operand. - auto extractSourceSlice = b.create( - loc, unPackOp.getSource(), offsets, sizes, strides); - tiledOperands.insert(tiledOperands.begin(), extractSourceSlice); - for (auto tile : unPackOp.getInnerTiles()) - tiledOperands.push_back(tile); - - // Create tiled unpack op. - Operation *tiledUnPackOp = - b.create(loc, TypeRange{extractDestSlice.getType()}, - tiledOperands, op->getAttrs()); - - return TilingResult{{tiledUnPackOp}, - SmallVector(tiledUnPackOp->getResults()), - llvm::to_vector(ArrayRef{ - extractSourceSlice, extractDestSlice})}; - } -}; - } // namespace FailureOr tensor::bubbleUpPadSlice(OpBuilder &b, @@ -949,15 +307,5 @@ void mlir::tensor::registerTilingInterfaceExternalModels( DialectRegistry ®istry) { registry.addExtension(+[](MLIRContext *ctx, TensorDialect *dialect) { tensor::PadOp::attachInterface(*ctx); - tensor::PackOp::attachInterface(*ctx); - tensor::UnPackOp::attachInterface(*ctx); - }); -} - -void mlir::tensor::registerTilingInterfaceExternalModelsForPackUnPackOps( - DialectRegistry ®istry) { - registry.addExtension(+[](MLIRContext *ctx, TensorDialect *dialect) { - tensor::PackOp::attachInterface(*ctx); - tensor::UnPackOp::attachInterface(*ctx); }); } diff --git a/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp b/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp index 99199252710f9..f3560d08ff769 100644 --- a/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp +++ b/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp @@ -100,11 +100,6 @@ void transform::ApplyFoldTensorEmptyPatternsOp::populatePatterns( tensor::populateFoldTensorEmptyPatterns(patterns, getFoldSingleUseOnly()); } -void transform::ApplyFoldIntoPackAndUnpackPatternsOp::populatePatterns( - RewritePatternSet &patterns) { - tensor::populateFoldIntoPackAndUnpackPatterns(patterns); -} - void transform::ApplyFoldTensorSubsetOpsPatternsOp::populatePatterns( RewritePatternSet &patterns) { tensor::populateFoldTensorSubsetOpPatterns(patterns); diff --git a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt index cc6275fee671a..7880d1c5a0c5d 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt @@ -6,7 +6,6 @@ add_mlir_dialect_library(MLIRTensorTransforms FoldTensorSubsetOps.cpp IndependenceTransforms.cpp MergeConsecutiveInsertExtractSlicePatterns.cpp - PackAndUnpackPatterns.cpp ReshapePatterns.cpp RewriteAsConstant.cpp SwapExtractSliceWithProducerPatterns.cpp diff --git a/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp index 60b0c3e759b6c..fa748cf01977f 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp @@ -93,49 +93,6 @@ struct FoldEmptyTensorWithExtractSliceOp bool foldSingleUseOnly = false; }; -/// tensor.empty does not define any tensor contents, so an unpadded pack -/// can be folded away. -struct FoldEmptyTensorWithPackOp : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(PackOp packOp, - PatternRewriter &rewriter) const override { - // Check for tensor.empty source. 
- auto emptyOp = packOp.getSource().getDefiningOp(); - if (!emptyOp) - return failure(); - - // Check for padding. - // Packing with padding cannot be simply removed. - if (packOp.getPaddingValue()) - return rewriter.notifyMatchFailure(packOp, "expects no padding value"); - - // Replace the pack directly with its destination. - rewriter.replaceOp(packOp, packOp.getDest()); - - return success(); - } -}; - -/// tensor.empty does not define any tensor contents, so an unpack -/// can be folded away. -struct FoldEmptyTensorWithUnPackOp : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(UnPackOp unPackOp, - PatternRewriter &rewriter) const override { - // Check for tensor.empty source. - auto emptyOp = unPackOp.getSource().getDefiningOp(); - if (!emptyOp) - return failure(); - - // Replace the unpack directly with its destination. - rewriter.replaceOp(unPackOp, unPackOp.getDest()); - - return success(); - } -}; - // Fold concat operation where all the operands are empty. struct FoldConcatsOfEmpty : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -176,7 +133,6 @@ void mlir::tensor::populateFoldTensorEmptyPatterns(RewritePatternSet &patterns, FoldEmptyTensorWithReshapeOp, FoldEmptyTensorWithReshapeOp>( patterns.getContext(), /*benefit=*/1, foldSingleUseOnly); - patterns.add(patterns.getContext(), - /*benefit=*/1); + patterns.add(patterns.getContext(), + /*benefit=*/1); } diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp index 5c16e538ac242..52462aae4bc80 100644 --- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp @@ -92,61 +92,6 @@ mlir::tensor::computeTransposedType(RankedTensorType rankedTensorType, return transposedTensorType; } -/// The permutation can be obtained from two permutations: -/// a) Compute the permutation vector to move the last `numPackedDims` into -/// the `innerPosDims` of a shape of rank `rank`. -/// b) Compute the permutation vector to move outer dims if the -/// `outerPerm` parameter is not empty. -/// Apply (b) permutation on (a) permutation to get the final permutation. 
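The comment above builds the final permutation from two pieces: (a) a permutation that moves the trailing packed dims into their insert positions, and (b) an optional outer-dims permutation, with (b) applied on top of (a). A standalone illustration of that composition step (hypothetical code, not part of the patch; applyPerm stands in for the permutation helpers and the sample vectors are arbitrary):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // res[i] = vec[perm[i]]
    static std::vector<int64_t> applyPerm(const std::vector<int64_t> &vec,
                                          const std::vector<int64_t> &perm) {
      std::vector<int64_t> res(perm.size());
      for (size_t i = 0, e = perm.size(); i < e; ++i)
        res[i] = vec[perm[i]];
      return res;
    }

    int main() {
      std::vector<int64_t> innerPositionsPerm = {0, 2, 1, 3}; // step (a), example only
      std::vector<int64_t> outerPositionPerm = {1, 0, 2, 3};  // step (b), example only
      std::vector<int64_t> combined = applyPerm(innerPositionsPerm, outerPositionPerm);
      assert((combined == std::vector<int64_t>{2, 0, 1, 3}));
      return 0;
    }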
-static SmallVector<int64_t>
-computePackUnPackPerm(int64_t rank, ArrayRef<int64_t> &innerDimsPos,
-                      ArrayRef<int64_t> &outerPerm,
-                      PackingMetadata &packingMetadata) {
-  int64_t numPackedDims = innerDimsPos.size();
-  auto lastDims =
-      llvm::to_vector(llvm::seq<int64_t>(rank - numPackedDims, rank));
-  packingMetadata = computePackingMetadata(rank, innerDimsPos);
-  SmallVector<int64_t> innerPositionsPerm =
-      computePermutationVector(rank, lastDims, packingMetadata.insertPositions);
-
-  SmallVector<int64_t> outerPos = packingMetadata.outerPositions;
-  if (!outerPerm.empty())
-    applyPermutationToVector(outerPos, outerPerm);
-  SmallVector<int64_t> outerPositionPerm =
-      computePermutationVector(rank, packingMetadata.outerPositions, outerPos);
-
-  SmallVector<int64_t> packInverseDestPermutation = innerPositionsPerm;
-  applyPermutationToVector(packInverseDestPermutation, outerPositionPerm);
-  return packInverseDestPermutation;
-}
-
-SmallVector<int64_t> mlir::tensor::getPackInverseDestPerm(PackOp packOp) {
-
-  PackingMetadata pMetadata;
-  int64_t packedRank = packOp.getDestType().getRank();
-  ArrayRef<int64_t> innerDimPos = packOp.getInnerDimsPos();
-  ArrayRef<int64_t> outerPerm = packOp.getOuterDimsPerm();
-  SmallVector<int64_t> packInvDestPerm =
-      computePackUnPackPerm(packedRank, innerDimPos, outerPerm, pMetadata);
-  return packInvDestPerm;
-}
-
-SmallVector<int64_t> mlir::tensor::getUnPackInverseSrcPerm(UnPackOp unpackOp) {
-  PackingMetadata metadata;
-  return mlir::tensor::getUnPackInverseSrcPerm(unpackOp, metadata);
-}
-
-SmallVector<int64_t>
-mlir::tensor::getUnPackInverseSrcPerm(UnPackOp unpackOp,
-                                      PackingMetadata &metadata) {
-  int64_t unpackRank = unpackOp.getSourceType().getRank();
-  ArrayRef<int64_t> innerDimPos = unpackOp.getInnerDimsPos();
-  ArrayRef<int64_t> outerPerm = unpackOp.getOuterDimsPerm();
-  SmallVector<int64_t> unpackInvSrcPerm =
-      computePackUnPackPerm(unpackRank, innerDimPos, outerPerm, metadata);
-  return unpackInvSrcPerm;
-}
-
 bool mlir::tensor::isCastLikeInsertSliceOp(InsertSliceOp op) {
   llvm::SmallBitVector droppedDims = op.getDroppedDims();
   int64_t srcDim = 0;
diff --git a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
index 70b2aaf9a17e0..0336423c57b1d 100644
--- a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
+++ b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
@@ -483,3 +483,13 @@ PackingMetadata mlir::computePackingMetadata(int64_t packedRank,
   }
   return res;
 }
+
+OpFoldResult mlir::reshapeConstantSource(DenseElementsAttr source,
+                                         TensorType result,
+                                         std::optional<Attribute> cst) {
+  if (source && source.isSplat() && result.hasStaticShape() &&
+      (!cst.has_value() || source.getSplatValue<Attribute>() == cst.value()))
+    return source.resizeSplat(result);
+
+  return {};
+}
diff --git a/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir b/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir
index 01ca4374da046..4ba4b09f52163 100644
--- a/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir
+++ b/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir
@@ -38,64 +38,64 @@ func.func @block_matmul_transpose_b(
 // MMT4D-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>
 // MMT4D-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
 // MMT4D-LABEL: func @block_matmul
-// MMT4D-COUNT-3: tensor.pack
+// MMT4D-COUNT-3: linalg.pack
 // MMT4D: linalg.generic
 // MMT4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
 // MMT4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]
-// MMT4D-COUNT-1: tensor.unpack
+// MMT4D-COUNT-1: linalg.unpack
 // MMT4D-LABEL: func @block_matmul_transpose_a
-// 
MMT4D-COUNT-3: tensor.pack +// MMT4D-COUNT-3: linalg.pack // MMT4D: linalg.generic // MMT4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MMT4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MMT4D-COUNT-1: tensor.unpack +// MMT4D-COUNT-1: linalg.unpack // MMT4D-LABEL: func @block_matmul_transpose_b -// MMT4D-COUNT-3: tensor.pack +// MMT4D-COUNT-3: linalg.pack // MMT4D: linalg.generic // MMT4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MMT4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MMT4D-COUNT-1: tensor.unpack +// MMT4D-COUNT-1: linalg.unpack // MM4D-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)> // MM4D-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)> // MM4D-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)> // MM4D-LABEL: func @block_matmul -// MM4D-COUNT-3: tensor.pack +// MM4D-COUNT-3: linalg.pack // MM4D: linalg.generic // MM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MM4D-COUNT-1: tensor.unpack +// MM4D-COUNT-1: linalg.unpack // MM4D-LABEL: func @block_matmul_transpose_a -// MM4D-COUNT-3: tensor.pack +// MM4D-COUNT-3: linalg.pack // MM4D: linalg.generic // MM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MM4D-COUNT-1: tensor.unpack +// MM4D-COUNT-1: linalg.unpack // MM4D-LABEL: func @block_matmul_transpose_b -// MM4D-COUNT-3: tensor.pack +// MM4D-COUNT-3: linalg.pack // MM4D: linalg.generic // MM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MM4D-COUNT-1: tensor.unpack +// MM4D-COUNT-1: linalg.unpack // MTM4D-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d5, d3)> // MTM4D-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)> // MTM4D-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)> // MTM4D-LABEL: func @block_matmul -// MTM4D-COUNT-3: tensor.pack +// MTM4D-COUNT-3: linalg.pack // MTM4D: linalg.generic // MTM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MTM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MTM4D-COUNT-1: tensor.unpack +// MTM4D-COUNT-1: linalg.unpack // MTM4D-LABEL: func @block_matmul_transpose_a -// MTM4D-COUNT-3: tensor.pack +// MTM4D-COUNT-3: linalg.pack // MTM4D: linalg.generic // MTM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MTM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MTM4D-COUNT-1: tensor.unpack +// MTM4D-COUNT-1: linalg.unpack // MTM4D-LABEL: func @block_matmul_transpose_b -// MTM4D-COUNT-3: tensor.pack +// MTM4D-COUNT-3: linalg.pack // MTM4D: linalg.generic // MTM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MTM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MTM4D-COUNT-1: tensor.unpack +// MTM4D-COUNT-1: linalg.unpack diff --git a/mlir/test/Dialect/Linalg/block-pack-matmul-padding.mlir b/mlir/test/Dialect/Linalg/block-pack-matmul-padding.mlir index 
9e396ba08d246..e667879ceea0e 100644 --- a/mlir/test/Dialect/Linalg/block-pack-matmul-padding.mlir +++ b/mlir/test/Dialect/Linalg/block-pack-matmul-padding.mlir @@ -21,17 +21,17 @@ func.func @block_matmul_padding( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<123x125xf32>, %[[B:[0-9a-z]+]]: tensor<125x124xf32>, %[[C:[0-9a-z]+]]: tensor<123x124xf32> // CHECK-DAG: %[[ZERO:.+]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<4x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: padding_value(%[[ZERO]] : f32) // CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<123x125xf32> -> tensor<4x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<8x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: padding_value(%[[ZERO]] : f32) // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<125x124xf32> -> tensor<8x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<4x8x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: padding_value(%[[ZERO]] : f32) // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<123x124xf32> -> tensor<4x8x32x16xf32> @@ -39,17 +39,17 @@ func.func @block_matmul_padding( // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<4x2x32x64xf32>, tensor<8x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<4x8x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<4x8x32x16xf32> -> tensor<123x124xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<123x124xf32> // NOPAD-LABEL: func @block_matmul_padding( // NOPAD-SAME: %[[A:[0-9a-z]+]]: tensor<123x125xf32>, %[[B:[0-9a-z]+]]: tensor<125x124xf32>, %[[C:[0-9a-z]+]]: tensor<123x124xf32> -// NOPAD-NOT: tensor.pack +// NOPAD-NOT: linalg.pack // NOPAD: linalg.matmul ins(%[[A]], %[[B]] : tensor<123x125xf32>, tensor<125x124xf32>) // NOPAD-SAME: outs(%[[C]] : tensor<123x124xf32>) -> tensor<123x124xf32> -// NOPAD-NOT: tensor.unpack +// NOPAD-NOT: linalg.unpack // PAD-MULT-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)> // PAD-MULT-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)> @@ -58,17 +58,17 @@ func.func @block_matmul_padding( // PAD-MULT-SAME: %[[A:[0-9a-z]+]]: tensor<123x125xf32>, %[[B:[0-9a-z]+]]: tensor<125x124xf32>, %[[C:[0-9a-z]+]]: tensor<123x124xf32> // PAD-MULT-DAG: %[[ZERO:.+]] = arith.constant 0.000000e+00 : f32 // PAD-MULT: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<1x1x256x384xf32> -// PAD-MULT: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// PAD-MULT: %[[A_PACKED:.+]] = linalg.pack %[[A]] // PAD-MULT-SAME: padding_value(%[[ZERO]] : f32) // PAD-MULT-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [256, 384] // PAD-MULT-SAME: into %[[PACK_DST_0]] : tensor<123x125xf32> -> tensor<1x1x256x384xf32> // PAD-MULT: %[[PACK_DST_1:.+]] = 
tensor.empty() : tensor<1x1x512x384xf32> -// PAD-MULT: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// PAD-MULT: %[[B_PACKED:.+]] = linalg.pack %[[B]] // PAD-MULT-SAME: padding_value(%[[ZERO]] : f32) // PAD-MULT-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [512, 384] // PAD-MULT-SAME: into %[[PACK_DST_1]] : tensor<125x124xf32> -> tensor<1x1x512x384xf32> // PAD-MULT: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<1x1x256x512xf32> -// PAD-MULT: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// PAD-MULT: %[[C_PACKED:.+]] = linalg.pack %[[C]] // PAD-MULT-SAME: padding_value(%[[ZERO]] : f32) // PAD-MULT-SAME: inner_dims_pos = [0, 1] inner_tiles = [256, 512] // PAD-MULT-SAME: into %[[PACK_DST_2]] : tensor<123x124xf32> -> tensor<1x1x256x512xf32> @@ -76,7 +76,7 @@ func.func @block_matmul_padding( // PAD-MULT-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // PAD-MULT-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // PAD-MULT-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<1x1x256x384xf32>, tensor<1x1x512x384xf32>) outs(%[[C_PACKED]] : tensor<1x1x256x512xf32>) -// PAD-MULT: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// PAD-MULT: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // PAD-MULT-SAME: inner_dims_pos = [0, 1] inner_tiles = [256, 512] // PAD-MULT-SAME: into %[[C]] : tensor<1x1x256x512xf32> -> tensor<123x124xf32> // PAD-MULT: return %[[RES_UNPACKED]] : tensor<123x124xf32> diff --git a/mlir/test/Dialect/Linalg/block-pack-matmul.mlir b/mlir/test/Dialect/Linalg/block-pack-matmul.mlir index 8a82608177692..aa860dbd581a9 100644 --- a/mlir/test/Dialect/Linalg/block-pack-matmul.mlir +++ b/mlir/test/Dialect/Linalg/block-pack-matmul.mlir @@ -14,22 +14,22 @@ func.func @block_matmul( // CHECK-LABEL: func @block_matmul( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<128x128xf32>, %[[B:[0-9a-z]+]]: tensor<128x128xf32>, %[[C:[0-9a-z]+]]: tensor<128x128xf32> // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<4x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<128x128xf32> -> tensor<4x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<8x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<128x128xf32> -> tensor<8x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<4x8x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<128x128xf32> -> tensor<4x8x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<4x2x32x64xf32>, tensor<8x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<4x8x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : 
tensor<4x8x32x16xf32> -> tensor<128x128xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<128x128xf32> @@ -60,7 +60,7 @@ func.func @block_matmul_dynamic( // CHECK-DAG: %[[A_OUTER_TILE_M:.+]] = affine.apply #[[$MAP_M]]()[%[[A_M]]] // CHECK-DAG: %[[A_OUTER_TILE_K:.+]] = affine.apply #[[$MAP_K]]()[%[[A_K]]] // CHECK: %[[PACK_DST_0:.+]] = tensor.empty(%[[A_OUTER_TILE_M]], %[[A_OUTER_TILE_K]]) : tensor -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: padding_value(%[[ZERO]] : f32) // CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor -> tensor @@ -69,7 +69,7 @@ func.func @block_matmul_dynamic( // CHECK-DAG: %[[B_OUTER_TILE_K:.+]] = affine.apply #[[$MAP_K]]()[%[[B_K]]] // CHECK-DAG: %[[B_OUTER_TILE_N:.+]] = affine.apply #[[$MAP_N]]()[%[[B_N]]] // CHECK: %[[PACK_DST_1:.+]] = tensor.empty(%[[B_OUTER_TILE_N]], %[[B_OUTER_TILE_K]]) : tensor -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: padding_value(%[[ZERO]] : f32) // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor -> tensor @@ -78,7 +78,7 @@ func.func @block_matmul_dynamic( // CHECK-DAG: %[[C_OUTER_TILE_M:.+]] = affine.apply #[[$MAP_M]]()[%[[C_M]]] // CHECK-DAG: %[[C_OUTER_TILE_N:.+]] = affine.apply #[[$MAP_N]]()[%[[C_N]]] // CHECK: %[[PACK_DST_2:.+]] = tensor.empty(%[[C_OUTER_TILE_M]], %[[C_OUTER_TILE_N]]) : tensor -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: padding_value(%[[ZERO]] : f32) // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor -> tensor @@ -86,7 +86,7 @@ func.func @block_matmul_dynamic( // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor, tensor) outs(%[[C_PACKED]] : tensor) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor -> tensor // CHECK: return %[[RES_UNPACKED]] : tensor @@ -107,7 +107,7 @@ func.func @block_matmul_with_constant( // CHECK-DAG: %[[RES_DST:.+]] = arith.constant dense<0.000000e+00> : tensor<128x128xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: ins({{.*}} : tensor<4x2x32x64xf32>, tensor<8x2x16x64xf32>) outs(%[[CST_ACC_PACKED]] : tensor<4x8x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[RES_DST]] : tensor<4x8x32x16xf32> -> tensor<128x128xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<128x128xf32> @@ -130,7 +130,7 @@ func.func @block_matmul_with_producer( // CHECK: %[[ACC_PACKED:.+]] = linalg.fill ins(%[[C0]] : f32) outs(%[[FILL_DST_PACKED]] : tensor<4x8x32x16xf32>) -> tensor<4x8x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: ins({{.*}} : tensor<4x2x32x64xf32>, tensor<8x2x16x64xf32>) outs(%[[ACC_PACKED]] : tensor<4x8x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: 
%[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<4x8x32x16xf32> -> tensor<128x128xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<128x128xf32> @@ -152,7 +152,7 @@ func.func @block_matmul_with_consumer( // CHECK-DAG: %[[RES_DST:.+]] = tensor.empty() : tensor<128x128xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: outs({{.*}} : tensor<4x8x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<4x8x32x16xf32> -> tensor<128x128xf32> // CHECK: %[[ADD_RES:.+]] = linalg.add @@ -175,22 +175,22 @@ func.func @block_batch_matmul( // CHECK-LABEL: func @block_batch_matmul( // CHECK-SAME: %[[A:.+]]: tensor<512x64x128xf32>, %[[B:.+]]: tensor<512x128x64xf32>, %[[C:.+]]: tensor<512x64x64xf32> // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<512x2x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<512x64x128xf32> -> tensor<512x2x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<512x4x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<512x128x64xf32> -> tensor<512x4x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<512x2x4x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<512x64x64xf32> -> tensor<512x2x4x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<512x2x2x32x64xf32>, tensor<512x4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<512x2x4x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<512x2x4x32x16xf32> -> tensor<512x64x64xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<512x64x64xf32> @@ -211,22 +211,22 @@ func.func @block_matmul_transpose_a( // CHECK-LABEL: func @block_matmul_transpose_a( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<128x64xf32>, %[[B:[0-9a-z]+]]: tensor<128x64xf32>, %[[C:[0-9a-z]+]]: tensor<64x64xf32> // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<2x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<128x64xf32> -> tensor<2x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<4x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: outer_dims_perm = [1, 0] 
inner_dims_pos = [1, 0] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<128x64xf32> -> tensor<4x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<2x4x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<64x64xf32> -> tensor<2x4x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<2x2x32x64xf32>, tensor<4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<2x4x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<2x4x32x16xf32> -> tensor<64x64xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<64x64xf32> @@ -247,22 +247,22 @@ func.func @block_batch_matmul_transpose_a( // CHECK-LABEL: func @block_batch_matmul_transpose_a( // CHECK-SAME: %[[A:.+]]: tensor<512x128x64xf32>, %[[B:.+]]: tensor<512x128x64xf32>, %[[C:.+]]: tensor<512x64x64xf32> // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<512x2x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<512x128x64xf32> -> tensor<512x2x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<512x4x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<512x128x64xf32> -> tensor<512x4x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<512x2x4x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<512x64x64xf32> -> tensor<512x2x4x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<512x2x2x32x64xf32>, tensor<512x4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<512x2x4x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<512x2x4x32x16xf32> -> tensor<512x64x64xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<512x64x64xf32> @@ -283,22 +283,22 @@ func.func @block_matmul_transpose_b( // CHECK-LABEL: func @block_matmul_transpose_b( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<64x128xf32>, %[[B:[0-9a-z]+]]: tensor<64x128xf32>, %[[C:[0-9a-z]+]]: tensor<64x64xf32> // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<2x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: 
outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<64x128xf32> -> tensor<2x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<4x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<64x128xf32> -> tensor<4x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<2x4x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<64x64xf32> -> tensor<2x4x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<2x2x32x64xf32>, tensor<4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<2x4x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<2x4x32x16xf32> -> tensor<64x64xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<64x64xf32> @@ -319,22 +319,22 @@ func.func @block_batch_matmul_transpose_b( // CHECK-LABEL: func @block_batch_matmul_transpose_b( // CHECK-SAME: %[[A:.+]]: tensor<512x64x128xf32>, %[[B:.+]]: tensor<512x64x128xf32>, %[[C:.+]]: tensor<512x64x64xf32> // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<512x2x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<512x64x128xf32> -> tensor<512x2x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<512x4x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<512x64x128xf32> -> tensor<512x4x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<512x2x4x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<512x64x64xf32> -> tensor<512x2x4x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<512x2x2x32x64xf32>, tensor<512x4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<512x2x4x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<512x2x4x32x16xf32> -> tensor<512x64x64xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<512x64x64xf32> @@ -365,22 +365,22 @@ func.func @block_generic_matmul( // 
CHECK-LABEL: func @block_generic_matmul( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<128x128xf32>, %[[B:[0-9a-z]+]]: tensor<128x128xf32>, %[[C:[0-9a-z]+]]: tensor<128x128xf32> // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<4x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<128x128xf32> -> tensor<4x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<8x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<128x128xf32> -> tensor<8x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<4x8x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<128x128xf32> -> tensor<4x8x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<4x2x32x64xf32>, tensor<8x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<4x8x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<4x8x32x16xf32> -> tensor<128x128xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<128x128xf32> @@ -411,22 +411,22 @@ func.func @block_generic_matmul_transpose_a( // CHECK-LABEL: func @block_generic_matmul_transpose_a( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<128x64xf32>, %[[B:[0-9a-z]+]]: tensor<128x64xf32>, %[[C:[0-9a-z]+]]: tensor<64x64xf32> // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<2x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<128x64xf32> -> tensor<2x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<4x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<128x64xf32> -> tensor<4x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<2x4x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<64x64xf32> -> tensor<2x4x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<2x2x32x64xf32>, tensor<4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<2x4x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: 
%[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<2x4x32x16xf32> -> tensor<64x64xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<64x64xf32> @@ -457,22 +457,22 @@ func.func @block_generic_matmul_transpose_b( // CHECK-LABEL: func @block_generic_matmul_transpose_b( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<64x128xf32>, %[[B:[0-9a-z]+]]: tensor<64x128xf32>, %[[C:[0-9a-z]+]]: tensor<64x64xf32> // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<2x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<64x128xf32> -> tensor<2x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<4x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<64x128xf32> -> tensor<4x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<2x4x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<64x64xf32> -> tensor<2x4x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<2x2x32x64xf32>, tensor<4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<2x4x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<2x4x32x16xf32> -> tensor<64x64xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<64x64xf32> @@ -498,10 +498,10 @@ func.func @non_contraction_generic( // CHECK-LABEL: func @non_contraction_generic( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<64x128xf32> // CHECK-DAG: %[[C0:.+]] = arith.constant 0.000000e+00 : f32 -// CHECK-NOT: tensor.pack +// CHECK-NOT: linalg.pack // CHECK: %[[GENERIC:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]]] // CHECK-SAME: iterator_types = ["parallel", "parallel"] // CHECK-SAME: outs(%[[A]] : tensor<64x128xf32>) -// CHECK-NOT: tensor.unpack +// CHECK-NOT: linalg.unpack // CHECK: return %[[GENERIC]] : tensor<64x128xf32> diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir index cd439cd23ecd0..db4f6181f517c 100644 --- a/mlir/test/Dialect/Linalg/canonicalize.mlir +++ b/mlir/test/Dialect/Linalg/canonicalize.mlir @@ -357,7 +357,7 @@ func.func @fill_pack() -> tensor<24x32x16x16xf32> { %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<24x32x16x16xf32> %1 = linalg.fill ins(%cst : f32) outs(%dest : tensor<384x512xf32>) -> tensor<384x512xf32> - %pack = tensor.pack %1 inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %0 : tensor<384x512xf32> -> tensor<24x32x16x16xf32> + %pack = linalg.pack %1 inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %0 : tensor<384x512xf32> -> tensor<24x32x16x16xf32> return %pack : tensor<24x32x16x16xf32> } // CHECK-LABEL: 
func.func @fill_pack @@ -374,7 +374,7 @@ func.func @fill_pack_general() -> tensor<1x1x8x4x4x8xi32>{ %extracted_slice_15 = tensor.extract_slice %9[0, 0, 0, 0] [1, 1, 16, 64] [1, 1, 1, 1] : tensor<1x1x16x64xi32> to tensor<1x1x16x64xi32> %16 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_15 : tensor<1x1x16x64xi32>) -> tensor<1x1x16x64xi32> %0 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x4x8xi32> to tensor<1x1x8x4x4x8xi32> - %pack_18 = tensor.pack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %0 : tensor<1x1x16x64xi32> -> tensor<1x1x8x4x4x8xi32> + %pack_18 = linalg.pack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %0 : tensor<1x1x16x64xi32> -> tensor<1x1x8x4x4x8xi32> return %pack_18 : tensor<1x1x8x4x4x8xi32> } @@ -397,7 +397,7 @@ func.func @dynamic_fill_pack(%arg0: tensor) -> tensor { %1 = affine.apply #map()[%dim] %2 = affine.apply #map()[%dim_0] %3 = tensor.empty(%1, %2) : tensor - %pack = tensor.pack %0 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %3 : tensor -> tensor + %pack = linalg.pack %0 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %3 : tensor -> tensor return %pack : tensor } // CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -1249,3 +1249,499 @@ func.func @recursive_effect(%arg : tensor<1xf32>) { // CHECK-LABEL: @recursive_effect // CHECK: linalg.map + +//===----------------------------------------------------------------------===// +// linalg.pack +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @fold_pack_constant_splat +// CHECK-NOT: linalg.pack +// CHECK: arith.constant dense<1.000000e-01> : tensor<8x16x8x32xf32> +func.func @fold_pack_constant_splat(%dest : tensor<8x16x8x32xf32>) -> tensor<8x16x8x32xf32> { + %cst = arith.constant dense<1.000000e-01> : tensor<64x128xf32> + %0 = linalg.pack %cst outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] + inner_tiles = [8, 32] into %dest : tensor<64x128xf32> -> tensor<8x16x8x32xf32> + return %0 : tensor<8x16x8x32xf32> +} + +// ----- + +// CHECK-LABEL: func @fold_padding_value_pack_constant_splat +// CHECK-NOT: linalg.pack +// CHECK: arith.constant dense<1.000000e-01> : tensor<8x16x8x32xf32> +func.func @fold_padding_value_pack_constant_splat(%dest : tensor<8x16x8x32xf32>) -> tensor<8x16x8x32xf32> { + %pad = arith.constant 1.000000e-01 : f32 + %cst = arith.constant dense<1.000000e-01> : tensor<63x127xf32> + %0 = linalg.pack %cst + padding_value(%pad : f32) + outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] + inner_tiles = [8, 32] into %dest : tensor<63x127xf32> -> tensor<8x16x8x32xf32> + return %0 : tensor<8x16x8x32xf32> +} + + +// ----- + +// CHECK-LABEL: func @nofold_padding_value_pack_constant_splat +// CHECK: arith.constant dense<1.000000e-01> : tensor<63x127xf32> +// CHECK: linalg.pack +func.func @nofold_padding_value_pack_constant_splat(%dest : tensor<8x16x8x32xf32>) -> tensor<8x16x8x32xf32> { + %pad = arith.constant 0.0 : f32 + %cst = arith.constant dense<1.000000e-01> : tensor<63x127xf32> + %0 = linalg.pack %cst + padding_value(%pad : f32) + outer_dims_perm = [1, 0] + inner_dims_pos = [0, 1] + inner_tiles = [8, 32] + into %dest : tensor<63x127xf32> -> tensor<8x16x8x32xf32> + return %0 : tensor<8x16x8x32xf32> +} + +// ----- + +func.func @fold_padding_value_pack(%arg0: tensor<1200x500000xf32>) -> tensor<31250x1200x16x1xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() 
: tensor<31250x1200x16x1xf32> + %pack = linalg.pack %arg0 + padding_value(%cst : f32) + outer_dims_perm = [1, 0] + inner_dims_pos = [1, 0] + inner_tiles = [16, 1] + into %0 : tensor<1200x500000xf32> -> tensor<31250x1200x16x1xf32> + return %pack : tensor<31250x1200x16x1xf32> +} +// CHECK-LABEL: func @fold_padding_value_pack +// CHECK-NOT: padding_value + +// ----- + +func.func @infer_src_shape_pack(%src: tensor, %dest: tensor<10x20x30x40x16xf32>) -> tensor<10x20x30x40x16xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %pack = linalg.pack %src + padding_value(%cst : f32) + outer_dims_perm = [2, 1, 3, 0] + inner_dims_pos = [2] + inner_tiles = [16] + into %dest : tensor -> tensor<10x20x30x40x16xf32> + return %pack : tensor<10x20x30x40x16xf32> +} +// CHECK-LABEL: func.func @infer_src_shape_pack +// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] +// CHECK-SAME: %[[DEST:[0-9a-zA-Z]+]] +// CHECK: %[[CAST_SRC:.+]] = tensor.cast %[[SRC]] : tensor to tensor<40x20x?x30xf32> +// CHECK: %[[PACK:.+]] = linalg.pack %[[CAST_SRC]] {{.+}} into %[[DEST]] +// CHECK: return %[[PACK]] + +// ----- + +func.func @infer_dest_shape_pack(%src: tensor<30x20x?x10xf32>, %dest: tensor) -> tensor { + %cst = arith.constant 0.000000e+00 : f32 + %pack = linalg.pack %src + padding_value(%cst : f32) + outer_dims_perm = [2, 1, 3, 0] + inner_dims_pos = [2] + inner_tiles = [16] + into %dest : tensor<30x20x?x10xf32> -> tensor + return %pack : tensor +} +// CHECK-LABEL: func.func @infer_dest_shape_pack +// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] +// CHECK-SAME: %[[DEST:[0-9a-zA-Z]+]] +// CHECK: %[[CAST_DEST:.+]] = tensor.cast %[[DEST]] : tensor to tensor +// CHECK: %[[PACK:.+]] = linalg.pack %[[SRC]] {{.+}} into %[[CAST_DEST]] +// CHECK: %[[CAST_PACK:.+]] = tensor.cast %[[PACK]] : tensor to tensor +// CHECK: return %[[CAST_PACK]] + +// ----- + +func.func @no_infer_pack_shape(%arg0: tensor, %arg1: index) -> tensor<32x7x?x16x1xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty(%arg1) : tensor<32x7x?x16x1xf32> + %pack = linalg.pack %arg0 padding_value(%cst : f32) outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %0 : tensor -> tensor<32x7x?x16x1xf32> + return %pack : tensor<32x7x?x16x1xf32> +} +// CHECK-LABEL: func.func @no_infer_pack_shape +// CHECK-NOT: tensor.cast + +// ----- + +func.func @fold_padding_value_pack_negative1(%arg0: tensor<1200x499999xf32>) -> tensor<31250x1200x16x1xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor<31250x1200x16x1xf32> + %pack = linalg.pack %arg0 + padding_value(%cst : f32) + outer_dims_perm = [1, 0] + inner_dims_pos = [1, 0] + inner_tiles = [16, 1] + into %0 : tensor<1200x499999xf32> -> tensor<31250x1200x16x1xf32> + return %pack : tensor<31250x1200x16x1xf32> +} +// CHECK-LABEL: func @fold_padding_value_pack_negative1 +// CHECK: linalg.pack +// CHECK-SAME: padding_value + +// ----- + +func.func @fold_padding_value_pack_negative2(%arg0: tensor<1200x?xf32>, %arg1: tensor) -> tensor { + %cst = arith.constant 0.000000e+00 : f32 + %pack = linalg.pack %arg0 + padding_value(%cst : f32) + outer_dims_perm = [1, 0] + inner_dims_pos = [1, 0] + inner_tiles = [16, 1] + into %arg1 : tensor<1200x?xf32> -> tensor + return %pack : tensor +} +// CHECK-LABEL: func @fold_padding_value_pack_negative2 +// CHECK: linalg.pack +// CHECK-SAME: padding_value + +// ----- + +func.func @fold_padding_value_pack_negative3(%arg0: tensor<1200x500000xf32>, %arg1: tensor, %tile : index) -> tensor { + %cst = arith.constant 0.000000e+00 : f32 + %pack = linalg.pack %arg0 + 
padding_value(%cst : f32) + outer_dims_perm = [1, 0] + inner_dims_pos = [1, 0] + inner_tiles = [%tile, 1] + into %arg1 : tensor<1200x500000xf32> -> tensor + return %pack : tensor +} +// CHECK-LABEL: func @fold_padding_value_pack_negative3 +// CHECK: linalg.pack +// CHECK-SAME: padding_value + +// ----- + +//===----------------------------------------------------------------------===// +// linalg.unpack +//===----------------------------------------------------------------------===// + + +// CHECK-LABEL: func @fold_unpack_constant_splat +// CHECK-NOT: linalg.unpack +// CHECK: arith.constant dense<1.000000e-01> : tensor<128x256xf32> +func.func @fold_unpack_constant_splat(%dest : tensor<128x256xf32>) -> tensor<128x256xf32> { + %cst = arith.constant dense<1.000000e-01> : tensor<16x8x8x32xf32> + %0 = linalg.unpack %cst inner_dims_pos = [0, 1] + inner_tiles = [8, 32] into %dest : tensor<16x8x8x32xf32> -> tensor<128x256xf32> + return %0 : tensor<128x256xf32> +} + +// ----- + +func.func @infer_dest_shape_unpack(%src: tensor<10x20x30x40x16xf32>, %dest: tensor) -> tensor { + %unpack = linalg.unpack %src + outer_dims_perm = [2, 1, 3, 0] + inner_dims_pos = [2] + inner_tiles = [16] + into %dest : tensor<10x20x30x40x16xf32> -> tensor + return %unpack : tensor +} +// CHECK-LABEL: func.func @infer_dest_shape_unpack +// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] +// CHECK-SAME: %[[DEST:[0-9a-zA-Z]+]] +// CHECK: %[[CAST_DEST:.+]] = tensor.cast %[[DEST]] : tensor to tensor<40x20x?x30xf32> +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[SRC]] {{.+}} into %[[CAST_DEST]] +// CHECK: %[[CAST_UNPACK:.+]] = tensor.cast %[[UNPACK]] : tensor<40x20x?x30xf32> to tensor +// CHECK: return %[[CAST_UNPACK]] + +// ----- + +func.func @infer_src_shape_unpack(%src: tensor, %dest: tensor<30x20x?x10xf32>) -> tensor<30x20x?x10xf32> { + %unpack = linalg.unpack %src + outer_dims_perm = [2, 1, 3, 0] + inner_dims_pos = [2] + inner_tiles = [16] + into %dest : tensor -> tensor<30x20x?x10xf32> + return %unpack : tensor<30x20x?x10xf32> +} +// CHECK-LABEL: func.func @infer_src_shape_unpack +// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] +// CHECK-SAME: %[[DEST:[0-9a-zA-Z]+]] +// CHECK: %[[CAST_SRC:.+]] = tensor.cast %[[SRC]] : tensor to tensor +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[CAST_SRC]] +// CHECK: return %[[UNPACK]] + +// ----- + +func.func @no_infer_unpack_shape(%arg1: tensor<32x7x?x16x1xf32>, %arg2: index) -> tensor { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty(%arg2) : tensor + %unpack = linalg.unpack %arg1 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %0 : tensor<32x7x?x16x1xf32> -> tensor + return %unpack : tensor +} +// CHECK-LABEL: func.func @no_infer_unpack_shape +// CHECK-NOT: tensor.cast + +// ----- + +//===----------------------------------------------------------------------===// +// linalg.pack + linalg.unpack +//===----------------------------------------------------------------------===// + +// Chain: NC -> NCnc -> NCnc -> NC +// CHECK: func.func @unpack_pack( +// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>) +// CHECK: return %[[T]] : tensor<128x128xf32> +func.func @unpack_pack(%t: tensor<128x128xf32>) -> tensor<128x128xf32> { + %tensor_empty = tensor.empty() : tensor<16x16x8x8xf32> + %packed = linalg.pack %t inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty : tensor<128x128xf32> -> tensor<16x16x8x8xf32> + %tensor_empty1 = tensor.empty() : tensor<128x128xf32> + %unpacked = linalg.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty1 : 
tensor<16x16x8x8xf32> -> tensor<128x128xf32> + return %unpacked : tensor<128x128xf32> +} + +// ----- + +// Chain: NC -> NCcn -> NCnc -> NC +// CHECK: func.func @unpack_pack( +// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>) +// CHECK-NOT: return %[[T]] : tensor<128x128xf32> +func.func @unpack_pack(%t: tensor<128x128xf32>) -> tensor<128x128xf32> { + %tensor_empty = tensor.empty() : tensor<16x16x8x8xf32> + %packed = linalg.pack %t inner_dims_pos = [1, 0] inner_tiles = [8, 8] into %tensor_empty : tensor<128x128xf32> -> tensor<16x16x8x8xf32> + %tensor_empty1 = tensor.empty() : tensor<128x128xf32> + %unpacked = linalg.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty1 : tensor<16x16x8x8xf32> -> tensor +<128x128xf32> + return %unpacked : tensor<128x128xf32> +} + +// ----- + +// Chain: NC -> CNcn -> NCnc -> NC +// CHECK: func.func @unpack_pack( +// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>) +// CHECK-NOT: return %[[T]] : tensor<128x128xf32> +func.func @unpack_pack(%t: tensor<128x128xf32>) -> tensor<128x128xf32> { + %tensor_empty = tensor.empty() : tensor<16x16x8x8xf32> + %packed = linalg.pack %t outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [8, 8] into %tensor_empty : tensor<128x128xf32> -> tensor<16x16x8x8xf32> + %tensor_empty1 = tensor.empty() : tensor<128x128xf32> + %unpacked = linalg.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty1 : tensor<16x16x8x8xf32> -> tensor +<128x128xf32> + return %unpacked : tensor<128x128xf32> +} + +// ----- + +// Chain: NC -> NCnc -> NCnc -> NC +// CHECK: func.func @unpack_pack( +// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>, +// CHECK: return %[[T]] : tensor<128x128xf32> +func.func @unpack_pack(%t: tensor<128x128xf32>, %tile1: index, %tile2: index) -> tensor<128x128xf32> { + %tensor_empty = tensor.empty(%tile1, %tile2) : tensor<16x16x?x?xf32> + %packed = linalg.pack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor<128x128xf32> -> tensor<16x16x?x?xf32> + %tensor_empty1 = tensor.empty() : tensor<128x128xf32> + %unpacked = linalg.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor<16x16x?x?xf32> -> tensor +<128x128xf32> + return %unpacked : tensor<128x128xf32> +} + +// ----- + +// CHECK: func.func @unpack_pack_with_padding_no_canonicalization( +// CHECK: linalg.pack +// CHECK: linalg.unpack +func.func @unpack_pack_with_padding_no_canonicalization(%t: tensor<256x512xbf16>) -> tensor<224x512xbf16> { + %tensor_empty = tensor.empty() : tensor<4x16x64x32xbf16> + %tensor_empty1 = tensor.empty() : tensor<224x512xbf16> + %packed = linalg.pack %t outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %tensor_empty : tensor<256x512xbf16> -> tensor<4x16x64x32xbf16> + %unpacked = linalg.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %tensor_empty1 : tensor<4x16x64x32xbf16> -> tensor<224x512xbf16> + return %unpacked : tensor<224x512xbf16> +} + +// ----- + +// Chain NCnc -> NC -> NC -> NCnc +// CHECK: func.func @pack_unpack( +// CHECK-SAME: %[[T:.+]]: tensor<16x16x?x?xf32>, +// CHECK: return %[[T]] : tensor<16x16x?x?xf32> +func.func @pack_unpack(%t: tensor<16x16x?x?xf32>, %tile1: index, %tile2: index) -> tensor<16x16x?x?xf32> { + %tensor_empty = tensor.empty() : tensor<128x128xf32> + %unpacked = linalg.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor<16x16x?x?xf32> -> tensor<128x128xf32> + %tensor_empty1 = tensor.empty(%tile1, %tile2) : 
tensor<16x16x?x?xf32> + %packed = linalg.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor<128x128xf32> -> tensor<16x16x?x?xf32> + return %packed : tensor<16x16x?x?xf32> +} + +// ----- + +// Chain NCnc -> NC -> NC -> NCnc +// CHECK: func.func @pack_unpack( +// CHECK-SAME: %[[T:.+]]: tensor<16x16x8x8xf32> +// CHECK: return %[[T]] : tensor<16x16x8x8xf32> +func.func @pack_unpack(%t: tensor<16x16x8x8xf32>) -> tensor<16x16x8x8xf32> { + %tensor_empty = tensor.empty() : tensor<128x128xf32> + %unpacked = linalg.unpack %t inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty : tensor<16x16x8x8xf32> -> tensor<128x128xf32> + %tensor_empty1 = tensor.empty() : tensor<16x16x8x8xf32> + %packed = linalg.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty1 : tensor<128x128xf32> -> tensor<16x16x8x8xf32> + return %packed : tensor<16x16x8x8xf32> +} + +// ----- + +// CHECK: func.func @pack_unpack_same_tiles( +// CHECK-SAME: %[[T:.+]]: tensor, +// CHECK: return %[[T]] : tensor +func.func @pack_unpack_same_tiles(%t: tensor, %dim1: index, %dim2: index, %dim3: index, %dim4: index, %dim5: index, %dim6: index, + %tile1: index, %tile2: index) -> tensor { + %tensor_empty = tensor.empty(%dim1, %dim2) : tensor + %unpacked = linalg.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor -> tensor + %tensor_empty1 = tensor.empty(%dim3, %dim4, %dim5, %dim6) : tensor + %packed = linalg.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor -> tensor + return %packed : tensor +} + +// ----- + +// CHECK: func.func @pack_unpack_different_tiles( +// CHECK-SAME: %[[T:.+]]: tensor, +// CHECK-NOT: return %[[T]] : tensor +func.func @pack_unpack_different_tiles(%t: tensor, %dim1: index, %dim2: index, %dim3: index, %dim4: index, %dim5: index, %dim6: index, + %tile1: index, %tile2: index) -> tensor { + %tensor_empty = tensor.empty(%dim1, %dim2) : tensor + %unpacked = linalg.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor -> tensor + %tensor_empty1 = tensor.empty(%dim3, %dim4, %dim5, %dim6) : tensor + %packed = linalg.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [%tile2, %tile1] into %tensor_empty1 : tensor -> tensor + return %packed : tensor +} + +// ----- + +// CHECK: func.func @pack_unpack_dynamic_with_padding( +// CHECK-SAME: %[[T:.+]]: tensor, +// CHECK-NOT: return %[[T]] : tensor +func.func @pack_unpack_dynamic_with_padding(%t: tensor, %dim1: index, %dim2: index, %dim3: index, %dim4: index, %dim5: index, %dim6: index, + %tile1: index, %tile2: index, %pad: f32) -> tensor { + %tensor_empty = tensor.empty(%dim1, %dim2) : tensor + %unpacked = linalg.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor -> tensor + %tensor_empty1 = tensor.empty(%dim3, %dim4, %dim5, %dim6) : tensor + %packed = linalg.pack %unpacked padding_value(%pad: f32) inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor -> tensor + return %packed : tensor +} + +// ----- + +// CHECK: func.func @pack_outer_dims_unpack_no_outer_dims( +// CHECK-SAME: %[[T:.+]]: tensor<16x16x?x?xf32>, +// CHECK: return %[[T]] : tensor<16x16x?x?xf32> +func.func @pack_outer_dims_unpack_no_outer_dims(%t: tensor<16x16x?x?xf32>, %tile1: index, %tile2: index) -> tensor<16x16x?x?xf32> { + %tensor_empty = tensor.empty() : tensor<128x128xf32> + %unpacked = linalg.unpack %t inner_dims_pos = [0, 1] inner_tiles = 
[%tile1, %tile2] into %tensor_empty : tensor<16x16x?x?xf32> -> tensor<128x128xf32> + %tensor_empty1 = tensor.empty(%tile1, %tile2) : tensor<16x16x?x?xf32> + %packed = linalg.pack %unpacked outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor<128x128xf32> -> tensor<16x16x?x?xf32> + return %packed : tensor<16x16x?x?xf32> +} + +// ----- + +// CHECK: func.func @pack_no_outer_dims_unpack_outer_dims( +// CHECK-SAME: %[[T:.+]]: tensor<16x16x?x?xf32>, +// CHECK: return %[[T]] : tensor<16x16x?x?xf32> +func.func @pack_no_outer_dims_unpack_outer_dims(%t: tensor<16x16x?x?xf32>, %tile1: index, %tile2: index) -> tensor<16x16x?x?xf32> { + %tensor_empty = tensor.empty() : tensor<128x128xf32> + %unpacked = linalg.unpack %t outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor<16x16x?x?xf32> -> tensor<128x128xf32> + %tensor_empty1 = tensor.empty(%tile1, %tile2) : tensor<16x16x?x?xf32> + %packed = linalg.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor<128x128xf32> -> tensor<16x16x?x?xf32> + return %packed : tensor<16x16x?x?xf32> +} + +// ----- + +//===----------------------------------------------------------------------===// +// tensor.cast + linalg.pack +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func.func @fold_cast_pack_dynamic_tile_size +// CHECK-SAME: %[[DEST:.*]]: tensor<1x1x8x1xi32>, +// CHECK-SAME: %[[SRC:.*]]: tensor<7x?xi32>, +// CHECK-SAME: %[[PAD:.*]]: i32) -> tensor<1x1x8x1xi32> { +// CHECK: %[[PACK:.*]] = linalg.pack %[[SRC]] padding_value(%[[PAD]] : i32) +// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %[[DEST]] +// CHECK-SAME: test_attr +// CHECK-SAME: : tensor<7x?xi32> -> tensor<1x1x8x1xi32> +// CHECK: return %[[PACK]] : tensor<1x1x8x1xi32> +func.func @fold_cast_pack_dynamic_tile_size( + %dest: tensor<1x1x8x1xi32>, + %src: tensor<7x?xi32>, + %pad: i32) -> tensor<1x1x8x1xi32> { + + %cast = tensor.cast %dest : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32> + %c8 = arith.constant 8 : index + %pack = linalg.pack %src padding_value(%pad : i32) + inner_dims_pos = [0, 1] + inner_tiles = [%c8, 1] + into %cast {test_attr} : tensor<7x?xi32> -> tensor<1x1x?x1xi32> + %res = tensor.cast %pack : tensor<1x1x?x1xi32> to tensor<1x1x8x1xi32> + return %res : tensor<1x1x8x1xi32> +} + +// ----- + +func.func @infer_and_fold_pack_unpack_same_tiles(%t: tensor<10x20x4x4xf32>) -> tensor<10x20x4x4xf32> { + %dim1 = arith.constant 40 : index + %dim2 = arith.constant 80 : index + %tensor_empty = tensor.empty(%dim1, %dim2) : tensor + %unpacked = linalg.unpack %t inner_dims_pos = [0, 1] inner_tiles = [4, 4] into %tensor_empty : tensor<10x20x4x4xf32> -> tensor + %cast = tensor.cast %unpacked : tensor to tensor<40x80xf32> + %tensor_empty1 = tensor.empty() : tensor<10x20x4x4xf32> + %packed = linalg.pack %cast inner_dims_pos = [0, 1] inner_tiles = [4, 4] into %tensor_empty1 : tensor<40x80xf32> -> tensor<10x20x4x4xf32> + return %packed : tensor<10x20x4x4xf32> +} +// CHECK-LABEL: func.func @infer_and_fold_pack_unpack_same_tiles +// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] +// CHECK: return %[[SRC]] + +// ----- + +// CHECK-LABEL: func.func @pack_dont_drop_attributes( +// CHECK: linalg.pack {{.*}} {test_attr} +func.func @pack_dont_drop_attributes(%arg0: tensor, %arg1: tensor<128x?x100x16x1xf16>) -> tensor<128x?x100x16x1xf16> { + %c32_i64 = arith.constant 32 : i64 + %cst = arith.constant 0.000000e+00 : f16 + 
%pack = linalg.pack %arg0 padding_value(%cst : f16) outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %arg1 {test_attr} : tensor -> tensor<128x?x100x16x1xf16> + return %pack : tensor<128x?x100x16x1xf16> +} +// ----- + +//===----------------------------------------------------------------------===// +// linalg.fill + linalg.unpack +//===----------------------------------------------------------------------===// +// Fold DstStyleOp -> tensor.unpack operations. +func.func @fold_dst_style_ops_into_unpack(%arg0 : tensor, %init : tensor) -> tensor { + %cst = arith.constant 0.0 : f32 + %fill = linalg.fill ins(%cst : f32) outs(%init : tensor) -> tensor + %unpack = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [16, 64] into %fill : tensor -> tensor + return %unpack : tensor +} +// CHECK-LABEL: func @fold_dst_style_ops_into_unpack +// CHECK-SAME: %[[ARG0:.+]]: tensor +// CHECK-SAME: %[[INIT:.+]]: tensor +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] +// CHECK-SAME: into %[[INIT]] +// CHECK: return %[[UNPACK]] + +// ----- + +//===----------------------------------------------------------------------===// +// tensor.cast + linalg.unpack +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func.func @fold_cast_unpack_dynamic_tile_size( +// CHECK-SAME: %[[SRC:.*]]: tensor<1x1x8x1xi32>, +// CHECK-SAME: %[[DEST:.*]]: tensor<7x?xi32>) -> tensor<7x?xi32> { +// CHECK: %[[RES:.*]] = linalg.unpack %[[SRC]] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %[[DEST]] {test_attr} : tensor<1x1x8x1xi32> -> tensor<7x?xi32> +// CHECK: return %[[RES]] : tensor<7x?xi32> +func.func @fold_cast_unpack_dynamic_tile_size( + %src: tensor<1x1x8x1xi32>, + %res: tensor<7x?xi32>) -> tensor<7x?xi32> { + + %cast = tensor.cast %src : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32> + %c8 = arith.constant 8 : index + %unpack = linalg.unpack %cast + inner_dims_pos = [0, 1] + inner_tiles = [%c8, 1] + into %res {test_attr} : tensor<1x1x?x1xi32> -> tensor<7x?xi32> + return %unpack : tensor<7x?xi32> +} diff --git a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir index b2b29b2b2fee2..19d4524a2ec06 100644 --- a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir +++ b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir @@ -15,7 +15,7 @@ func.func @dynamic_elem_pack(%arg0: tensor, %dest: tensor) %4 = arith.addf %arg3, %arg3 : f32 linalg.yield %4 : f32 } -> tensor - %4 = tensor.pack %3 + %4 = linalg.pack %3 inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest : tensor -> tensor @@ -34,7 +34,7 @@ func.func @dynamic_elem_pack(%arg0: tensor, %dest: tensor) // CHECK-DAG: %[[OUTER_D0:.+]] = affine.apply #[[$MAP0]]()[%[[D0]]] // CHECK-DAG: %[[OUTER_D1:.+]] = affine.apply #[[$MAP1]]()[%[[D1]]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty(%[[OUTER_D0]], %[[OUTER_D1]]) : tensor -// CHECK: %[[PACK_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [8, 2] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[ELEM:.+]] = linalg.generic @@ -62,7 +62,7 @@ func.func @dynamic_elem_pack_padding_value(%arg0: tensor, %dest: tensor %4 = arith.addf %arg3, %arg3 : f32 linalg.yield %4 : f32 } -> tensor - %4 = tensor.pack %3 padding_value(%cst : f32) + %4 = linalg.pack %3 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest : tensor -> tensor @@ -70,7 +70,7 @@ func.func 
@dynamic_elem_pack_padding_value(%arg0: tensor, %dest: tensor } // CHECK-LABEL: func.func @dynamic_elem_pack_padding_value // CHECK: %[[GENERIC:.+]] = linalg.generic -// CHECK: tensor.pack %[[GENERIC]] +// CHECK: linalg.pack %[[GENERIC]] // ----- @@ -84,7 +84,7 @@ func.func @elem_pack_transpose_inner_dims(%arg0: tensor<128x256xi32>, %dest: ten %4 = arith.addi %arg3, %arg3 : i32 linalg.yield %4 : i32 } -> tensor<128x256xi32> - %pack = tensor.pack %elem + %pack = linalg.pack %elem inner_dims_pos = [1, 0] inner_tiles = [16, 32] into %dest : tensor<128x256xi32> -> tensor<4x16x16x32xi32> @@ -95,7 +95,7 @@ func.func @elem_pack_transpose_inner_dims(%arg0: tensor<128x256xi32>, %dest: ten // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<4x16x16x32xi32> -// CHECK: %[[PACK_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [1, 0] inner_tiles = [16, 32] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[ELEM:.+]] = linalg.generic @@ -117,7 +117,7 @@ func.func @elem_pack_transpose_outer_dims(%arg0: tensor<128x256xi32>, %dest: ten %4 = arith.addi %arg3, %arg3 : i32 linalg.yield %4 : i32 } -> tensor<128x256xi32> - %pack = tensor.pack %elem + %pack = linalg.pack %elem outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] @@ -129,7 +129,7 @@ func.func @elem_pack_transpose_outer_dims(%arg0: tensor<128x256xi32>, %dest: ten // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<16x4x32x16xi32> -// CHECK: %[[PACK_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[ARG0_EMPTY]] : tensor<128x256xi32> -> tensor<16x4x32x16xi32> // CHECK: %[[ELEM:.+]] = linalg.generic @@ -151,7 +151,7 @@ func.func @elem_pack_transpose_inner_and_outer_dims(%arg0: tensor<128x256xi32>, %4 = arith.addi %arg3, %arg3 : i32 linalg.yield %4 : i32 } -> tensor<128x256xi32> - %pack = tensor.pack %elem + %pack = linalg.pack %elem outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 32] @@ -163,7 +163,7 @@ func.func @elem_pack_transpose_inner_and_outer_dims(%arg0: tensor<128x256xi32>, // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<16x4x16x32xi32> -// CHECK: %[[PACK_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 32] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[ELEM:.+]] = linalg.generic @@ -191,7 +191,7 @@ func.func @dynamic_broadcast_pack(%arg0: tensor, %arg1: tensor, %d %4 = arith.addf %arg3, %arg4 : f32 linalg.yield %4 : f32 } -> tensor - %4 = tensor.pack %3 + %4 = linalg.pack %3 inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest : tensor -> tensor @@ -210,13 +210,13 @@ func.func @dynamic_broadcast_pack(%arg0: tensor, %arg1: tensor, %d // CHECK-DAG: %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C0]] // CHECK-DAG: %[[OUTER_D0:.+]] = affine.apply #[[$MAP0]]()[%[[D0]]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty(%[[OUTER_D0]]) : tensor -// CHECK: %[[PACK_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [8] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK-DAG: 
%[[D1:.+]] = tensor.dim %[[ARG1]], %[[C0]] // CHECK-DAG: %[[OUTER_D1:.+]] = affine.apply #[[$MAP1]]()[%[[D1]]] // CHECK: %[[ARG1_EMPTY:.+]] = tensor.empty(%[[OUTER_D1]]) : tensor -// CHECK: %[[PACK_ARG1:.+]] = tensor.pack %[[ARG1]] +// CHECK: %[[PACK_ARG1:.+]] = linalg.pack %[[ARG1]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [2] // CHECK-SAME: into %[[ARG1_EMPTY]] // CHECK: %[[ELEM:.+]] = linalg.generic @@ -240,7 +240,7 @@ func.func @elem_pack_transpose_inner_and_outer_dims2(%arg0: tensor<64xf32>, %des ^bb0(%in: f32, %out: f32): linalg.yield %in : f32 } -> tensor<1x56x57x64xf32> - %2 = tensor.pack %1 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %dest : tensor<1x56x57x64xf32> -> tensor<1x2x56x57x32xf32> + %2 = linalg.pack %1 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %dest : tensor<1x56x57x64xf32> -> tensor<1x2x56x57x32xf32> return %2 : tensor<1x2x56x57x32xf32> } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d1, d4)> @@ -249,7 +249,7 @@ func.func @elem_pack_transpose_inner_and_outer_dims2(%arg0: tensor<64xf32>, %des // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<2x32xf32> -// CHECK: %[[PACKED_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACKED_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [32] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic @@ -275,7 +275,7 @@ func.func @transpose_pack(%arg0: tensor<100x128x200x256xi32>, %arg1: tensor<100x %1 = arith.addi %0, %b2 : i32 linalg.yield %1 : i32 } -> tensor<100x200x128x256xi32> - %4 = tensor.pack %transpose + %4 = linalg.pack %transpose inner_dims_pos = [3, 2] inner_tiles = [16, 32] into %dest : tensor<100x200x128x256xi32> -> tensor<100x200x4x16x16x32xi32> @@ -291,11 +291,11 @@ func.func @transpose_pack(%arg0: tensor<100x128x200x256xi32>, %arg1: tensor<100x // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]] // CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<100x4x200x16x16x32xi32> -// CHECK: %[[PACKED_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACKED_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [3, 1] inner_tiles = [16, 32] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[ARG2_EMPTY:.+]] = tensor.empty() : tensor<4x32xi32> -// CHECK: %[[PACKED_ARG2:.+]] = tensor.pack %[[ARG2]] +// CHECK: %[[PACKED_ARG2:.+]] = linalg.pack %[[ARG2]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [32] // CHECK-SAME: into %[[ARG2_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic @@ -321,7 +321,7 @@ func.func @affine_constant_expr_pack(%arg0: tensor<100x128x200x256xi32>, %arg1: %1 = arith.addi %0, %b2 : i32 linalg.yield %1 : i32 } -> tensor<100x200x128x256xi32> - %4 = tensor.pack %transpose + %4 = linalg.pack %transpose inner_dims_pos = [3, 2] inner_tiles = [16, 32] into %dest : tensor<100x200x128x256xi32> -> tensor<100x200x4x16x16x32xi32> @@ -337,11 +337,11 @@ func.func @affine_constant_expr_pack(%arg0: tensor<100x128x200x256xi32>, %arg1: // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]] // CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<100x4x200x16x16x32xi32> -// CHECK: %[[PACKED_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACKED_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [3, 1] inner_tiles = [16, 32] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[ARG2_EMPTY:.+]] = tensor.empty() : tensor<1x4x1x1x32xi32> -// 
CHECK: %[[PACKED_ARG2:.+]] = tensor.pack %[[ARG2]] +// CHECK: %[[PACKED_ARG2:.+]] = linalg.pack %[[ARG2]] // CHECK-SAME: inner_dims_pos = [1] inner_tiles = [32] // CHECK-SAME: into %[[ARG2_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic @@ -367,7 +367,7 @@ func.func @transpose_pack_with_outer_dims(%arg0: tensor<100x128x200x256xi32>, %a %1 = arith.addi %0, %b2 : i32 linalg.yield %1 : i32 } -> tensor<100x200x128x256xi32> - %4 = tensor.pack %transpose + %4 = linalg.pack %transpose outer_dims_perm = [1, 2, 3, 0] inner_dims_pos = [3, 2] inner_tiles = [16, 32] @@ -384,11 +384,11 @@ func.func @transpose_pack_with_outer_dims(%arg0: tensor<100x128x200x256xi32>, %a // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]] // CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<200x4x16x100x16x32xi32> -// CHECK: %[[PACKED_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACKED_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [2, 1, 3, 0] inner_dims_pos = [3, 1] inner_tiles = [16, 32] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[ARG2_EMPTY:.+]] = tensor.empty() : tensor<4x32xi32> -// CHECK: %[[PACKED_ARG2:.+]] = tensor.pack %[[ARG2]] +// CHECK: %[[PACKED_ARG2:.+]] = linalg.pack %[[ARG2]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [32] // CHECK-SAME: into %[[ARG2_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic @@ -408,7 +408,7 @@ func.func @elem_pack_transpose_outer_dims(%arg0: tensor<128x256xi32>, %init: ten linalg.yield %4 : i32 } -> tensor<128x256xi32> %empty = tensor.empty() : tensor<16x4x32x16xi32> - %pack = tensor.pack %elem + %pack = linalg.pack %elem outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] @@ -421,11 +421,11 @@ func.func @elem_pack_transpose_outer_dims(%arg0: tensor<128x256xi32>, %init: ten // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] // CHECK: %[[ARG1_EMPTY:.+]] = tensor.empty() : tensor<16x4x32x16xi32> -// CHECK: %[[PACKED_ARG1:.+]] = tensor.pack %[[ARG1]] +// CHECK: %[[PACKED_ARG1:.+]] = linalg.pack %[[ARG1]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[ARG1_EMPTY]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<16x4x32x16xi32> -// CHECK: %[[PACKED_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACKED_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic @@ -439,7 +439,7 @@ func.func @elem_pack_transpose_outer_dims(%arg0: tensor<128x256xi32>, %init: ten func.func @unpack_on_output(%arg0: tensor<12x2x56x56x32xf32>) -> tensor<12x56x56x64xf32> { %0 = tensor.empty() : tensor<12x56x56x64xf32> - %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<12x2x56x56x32xf32> -> tensor<12x56x56x64xf32> + %1 = linalg.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<12x2x56x56x32xf32> -> tensor<12x56x56x64xf32> %2 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1 : tensor<12x56x56x64xf32>) { ^bb0(%out: f32): %3 = arith.addf %out, %out : f32 @@ -452,17 +452,17 @@ func.func @unpack_on_output(%arg0: tensor<12x2x56x56x32xf32>) -> tensor<12x56x56 // CHECK-LABEL: func.func @unpack_on_output // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_EMPTY_UNPACK:.+]] = tensor.empty() : tensor<12x56x56x64xf32> -// CHECK: 
%[[UNPACKED_ARG0:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACKED_ARG0:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG0_EMPTY_UNPACK]] // CHECK: %[[ARG0_EMPTY_PACK:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32> -// CHECK: %[[PACKED_ARG0:.+]] = tensor.pack %[[UNPACKED_ARG0]] +// CHECK: %[[PACKED_ARG0:.+]] = linalg.pack %[[UNPACKED_ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG0_EMPTY_PACK]] // CHECK: %[[RES:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]]] // CHECK-SAME: outs(%[[PACKED_ARG0]] -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[RES]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[RES]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[UNPACKED_ARG0]] @@ -472,7 +472,7 @@ func.func @unpack_on_output(%arg0: tensor<12x2x56x56x32xf32>) -> tensor<12x56x56 func.func @unpack_on_input(%arg0: tensor<12x2x56x56x32xf32>, %init: tensor<12x56x56x64xf32>) -> tensor<12x56x56x64xf32> { %0 = tensor.empty() : tensor<12x56x56x64xf32> - %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<12x2x56x56x32xf32> -> tensor<12x56x56x64xf32> + %1 = linalg.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<12x2x56x56x32xf32> -> tensor<12x56x56x64xf32> %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1: tensor<12x56x56x64xf32>) outs(%init : tensor<12x56x56x64xf32>) { ^bb0(%in: f32, %out: f32): %3 = arith.addf %in, %out : f32 @@ -486,22 +486,22 @@ func.func @unpack_on_input(%arg0: tensor<12x2x56x56x32xf32>, %init: tensor<12x56 // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_UNPACK_EMPTY:.+]] = tensor.empty() : tensor<12x56x56x64xf32> -// CHECK: %[[UNPACKED_ARG0:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACKED_ARG0:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG0_UNPACK_EMPTY]] // CHECK: %[[ARG1_PACK_EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32> -// CHECK: %[[ARG1_PACK:.+]] = tensor.pack %[[ARG1]] +// CHECK: %[[ARG1_PACK:.+]] = linalg.pack %[[ARG1]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG1_PACK_EMPTY]] // CHECK: %[[ARG0_PACK_EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32> -// CHECK: %[[ARG0_PACK:.+]] = tensor.pack %[[UNPACKED_ARG0]] +// CHECK: %[[ARG0_PACK:.+]] = linalg.pack %[[UNPACKED_ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG0_PACK_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP]]] // CHECK-SAME: ins(%[[ARG0_PACK]] // CHECK-SAME: outs(%[[ARG1_PACK]] -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[RES]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[RES]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG1]] @@ -511,7 +511,7 @@ func.func @unpack_on_input(%arg0: tensor<12x2x56x56x32xf32>, %init: tensor<12x56 func.func @unpack_element_type_change(%arg0: tensor<12x2x56x56x32xf32>, %init: tensor<12x56x56x64xf16>) -> tensor<12x56x56x64xf16> { %0 = tensor.empty() : 
tensor<12x56x56x64xf32> - %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<12x2x56x56x32xf32> -> tensor<12x56x56x64xf32> + %1 = linalg.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<12x2x56x56x32xf32> -> tensor<12x56x56x64xf32> %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1: tensor<12x56x56x64xf32>) outs(%init : tensor<12x56x56x64xf16>) { ^bb0(%in: f32, %out: f16): %3 = arith.truncf %in : f32 to f16 @@ -525,22 +525,22 @@ func.func @unpack_element_type_change(%arg0: tensor<12x2x56x56x32xf32>, %init: t // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_UNPACK_EMPTY:.+]] = tensor.empty() : tensor<12x56x56x64xf32> -// CHECK: %[[UNPACKED_ARG0:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACKED_ARG0:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG0_UNPACK_EMPTY]] // CHECK: %[[ARG1_PACK_EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf16> -// CHECK: %[[ARG1_PACK:.+]] = tensor.pack %[[ARG1]] +// CHECK: %[[ARG1_PACK:.+]] = linalg.pack %[[ARG1]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG1_PACK_EMPTY]] // CHECK: %[[ARG0_PACK_EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32> -// CHECK: %[[ARG0_PACK:.+]] = tensor.pack %[[UNPACKED_ARG0]] +// CHECK: %[[ARG0_PACK:.+]] = linalg.pack %[[UNPACKED_ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG0_PACK_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP]]] // CHECK-SAME: ins(%[[ARG0_PACK]] // CHECK-SAME: outs(%[[ARG1_PACK]] -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[RES]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[RES]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG1]] @@ -551,7 +551,7 @@ func.func @unpack_element_type_change(%arg0: tensor<12x2x56x56x32xf32>, %init: t func.func @forward_tensor_empty(%arg0: tensor<12x2x56x56x32xf32>) -> tensor<12x56x56x64xf32> { %init = tensor.empty() : tensor<12x56x56x64xf32> %0 = tensor.empty() : tensor<12x56x56x64xf32> - %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<12x2x56x56x32xf32> -> tensor<12x56x56x64xf32> + %1 = linalg.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<12x2x56x56x32xf32> -> tensor<12x56x56x64xf32> %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1: tensor<12x56x56x64xf32>) outs(%init : tensor<12x56x56x64xf32>) { ^bb0(%in: f32, %out: f32): %3 = arith.addf %in, %in : f32 @@ -565,19 +565,19 @@ func.func @forward_tensor_empty(%arg0: tensor<12x2x56x56x32xf32>) -> tensor<12x5 // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[FINAL_RES:.+]] = tensor.empty() : tensor<12x56x56x64xf32> // CHECK: %[[ARG0_UNPACK_EMPTY:.+]] = tensor.empty() : tensor<12x56x56x64xf32> -// CHECK: %[[UNPACKED_ARG0:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACKED_ARG0:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG0_UNPACK_EMPTY]] // CHECK: %[[DEST:.+]] = 
tensor.empty() : tensor<12x2x56x56x32xf32> // CHECK: %[[ARG0_PACK_EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32> -// CHECK: %[[PACKED_ARG0:.+]] = tensor.pack %[[UNPACKED_ARG0]] +// CHECK: %[[PACKED_ARG0:.+]] = linalg.pack %[[UNPACKED_ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG0_PACK_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP]]] // CHECK-SAME: ins(%[[PACKED_ARG0]] // CHECK-SAME: outs(%[[DEST]] -// CHECK: %[[UNPACKED:.+]] = tensor.unpack %[[RES]] +// CHECK: %[[UNPACKED:.+]] = linalg.unpack %[[RES]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[FINAL_RES]] @@ -586,7 +586,7 @@ func.func @forward_tensor_empty(%arg0: tensor<12x2x56x56x32xf32>) -> tensor<12x5 func.func @pad_valid_unpack_propagation(%arg0: tensor<1x2x56x56x32xf32>) -> tensor<1x58x58x64xf32> { %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<1x56x56x64xf32> - %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<1x2x56x56x32xf32> -> tensor<1x56x56x64xf32> + %1 = linalg.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<1x2x56x56x32xf32> -> tensor<1x56x56x64xf32> %padded = tensor.pad %1 low[0, 1, 1, 0] high[0, 1, 1, 0] { ^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index): tensor.yield %cst : f32 @@ -599,7 +599,7 @@ func.func @pad_valid_unpack_propagation(%arg0: tensor<1x2x56x56x32xf32>) -> tens // CHECK: %[[CST:.+]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[PADDED:.+]] = tensor.pad %[[ARG0]] low[0, 0, 1, 1, 0] high[0, 0, 1, 1, 0] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x58x58x64xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[PADDED]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[PADDED]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[EMPTY]] : tensor<1x2x58x58x32xf32> -> tensor<1x58x58x64xf32> @@ -608,7 +608,7 @@ func.func @pad_valid_unpack_propagation(%arg0: tensor<1x2x56x56x32xf32>) -> tens func.func @pad_valid_unpack_propagation(%arg0: tensor<1x2x56x56x32xf32>) -> tensor<2x58x58x64xf32> { %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<1x56x56x64xf32> - %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<1x2x56x56x32xf32> -> tensor<1x56x56x64xf32> + %1 = linalg.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<1x2x56x56x32xf32> -> tensor<1x56x56x64xf32> %padded = tensor.pad %1 low[1, 1, 1, 0] high[0, 1, 1, 0] { ^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index): tensor.yield %cst : f32 @@ -621,7 +621,7 @@ func.func @pad_valid_unpack_propagation(%arg0: tensor<1x2x56x56x32xf32>) -> tens // CHECK: %[[CST:.+]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[PADDED:.+]] = tensor.pad %[[ARG0]] low[1, 0, 1, 1, 0] high[0, 0, 1, 1, 0] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<2x58x58x64xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[PADDED]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[PADDED]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[EMPTY]] : tensor<2x2x58x58x32xf32> -> tensor<2x58x58x64xf32> @@ -630,7 +630,7 @@ func.func @pad_valid_unpack_propagation(%arg0: tensor<1x2x56x56x32xf32>) -> tens 
func.func @pad_along_unpacked_dim(%arg0: tensor<1x2x56x56x32xf32>) -> tensor<1x58x58x66xf32> { %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<1x56x56x64xf32> - %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<1x2x56x56x32xf32> -> tensor<1x56x56x64xf32> + %1 = linalg.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<1x2x56x56x32xf32> -> tensor<1x56x56x64xf32> %padded = tensor.pad %1 low[0, 1, 1, 1] high[0, 1, 1, 1] { ^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index): tensor.yield %cst : f32 @@ -642,7 +642,7 @@ func.func @pad_along_unpacked_dim(%arg0: tensor<1x2x56x56x32xf32>) -> tensor<1x5 // CHECK: %[[ARG0:.+]]: tensor<1x2x56x56x32xf32>) // CHECK: %[[CST:.+]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x56x56x64xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[EMPTY]] : tensor<1x2x56x56x32xf32> -> tensor<1x56x56x64xf32> // CHECK: %[[PADDED:.+]] = tensor.pad %[[UNPACK]] low[0, 1, 1, 1] high[0, 1, 1, 1] @@ -656,7 +656,7 @@ func.func @pad_valid_pack_propagation(%arg0: tensor<1x64x56x56xf32>) -> tensor<1 tensor.yield %cst : f32 } : tensor<1x64x56x56xf32> to tensor<1x64x58x58xf32> %0 = tensor.empty() : tensor<1x2x58x58x32xf32> - %1 = tensor.pack %padded inner_dims_pos = [1] inner_tiles = [32] into %0 : tensor<1x64x58x58xf32> -> tensor<1x2x58x58x32xf32> + %1 = linalg.pack %padded inner_dims_pos = [1] inner_tiles = [32] into %0 : tensor<1x64x58x58xf32> -> tensor<1x2x58x58x32xf32> return %1 : tensor<1x2x58x58x32xf32> } @@ -664,7 +664,7 @@ func.func @pad_valid_pack_propagation(%arg0: tensor<1x64x56x56xf32>) -> tensor<1 // CHECK-SAME: %[[ARG0:.+]]: tensor<1x64x56x56xf32>) // CHECK: %[[CST:.+]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x2x56x56x32xf32> -// CHECK: %[[PACKED:.+]] = tensor.pack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [32] +// CHECK: %[[PACKED:.+]] = linalg.pack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [32] // CHECK-SAME: into %[[EMPTY]] : tensor<1x64x56x56xf32> -> tensor<1x2x56x56x32xf32> // CHECK: %[[PADDED:.+]] = tensor.pad %[[PACKED]] low[0, 0, 1, 1, 0] high[0, 0, 1, 1, 0] // CHECK: return %[[PADDED]] @@ -678,7 +678,7 @@ func.func @pad_valid_outer_dims_pack_propagation(%arg0: tensor<1x64x56x56xf32>) tensor.yield %cst : f32 } : tensor<1x64x56x56xf32> to tensor<1x64x58x58xf32> %0 = tensor.empty() : tensor<1x58x58x2x32xf32> - %1 = tensor.pack %padded outer_dims_perm = [0, 3, 2, 1] inner_dims_pos = [1] inner_tiles = [32] into %0 : tensor<1x64x58x58xf32> -> tensor<1x58x58x2x32xf32> + %1 = linalg.pack %padded outer_dims_perm = [0, 3, 2, 1] inner_dims_pos = [1] inner_tiles = [32] into %0 : tensor<1x64x58x58xf32> -> tensor<1x58x58x2x32xf32> return %1 : tensor<1x58x58x2x32xf32> } @@ -686,7 +686,7 @@ func.func @pad_valid_outer_dims_pack_propagation(%arg0: tensor<1x64x56x56xf32>) // CHECK-SAME: %[[ARG0:.+]]: tensor<1x64x56x56xf32>) // CHECK: %[[CST:.+]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x56x56x2x32xf32> -// CHECK: %[[PACKED:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACKED:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 2, 1] inner_dims_pos = [1] inner_tiles = [32] // CHECK-SAME: into %[[EMPTY]] : 
tensor<1x64x56x56xf32> -> tensor<1x56x56x2x32xf32> // CHECK: %[[PADDED:.+]] = tensor.pad %[[PACKED]] low[0, 1, 1, 0, 0] high[0, 1, 1, 0, 0] @@ -701,7 +701,7 @@ func.func @pad_along_packed_dim(%arg0: tensor<1x60x56x56xf32>) -> tensor<1x2x58x tensor.yield %cst : f32 } : tensor<1x60x56x56xf32> to tensor<1x64x58x58xf32> %0 = tensor.empty() : tensor<1x2x58x58x32xf32> - %1 = tensor.pack %padded inner_dims_pos = [1] inner_tiles = [32] into %0 : tensor<1x64x58x58xf32> -> tensor<1x2x58x58x32xf32> + %1 = linalg.pack %padded inner_dims_pos = [1] inner_tiles = [32] into %0 : tensor<1x64x58x58xf32> -> tensor<1x2x58x58x32xf32> return %1 : tensor<1x2x58x58x32xf32> } @@ -710,7 +710,7 @@ func.func @pad_along_packed_dim(%arg0: tensor<1x60x56x56xf32>) -> tensor<1x2x58x // CHECK: %[[CST:.+]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[PADDED:.+]] = tensor.pad %[[ARG0]] low[0, 2, 1, 1] high[0, 2, 1, 1] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x2x58x58x32xf32> -// CHECK: tensor.pack %[[PADDED]] inner_dims_pos = [1] inner_tiles = [32] +// CHECK: linalg.pack %[[PADDED]] inner_dims_pos = [1] inner_tiles = [32] // CHECK-SAME: into %[[EMPTY]] : tensor<1x64x58x58xf32> -> tensor<1x2x58x58x32xf32> // ----- @@ -722,7 +722,7 @@ func.func @multi_use_pad_pack_propagation(%arg0: tensor<1x64x56x56xf32>) -> (ten tensor.yield %cst : f32 } : tensor<1x64x56x56xf32> to tensor<1x64x58x58xf32> %0 = tensor.empty() : tensor<1x2x58x58x32xf32> - %1 = tensor.pack %padded inner_dims_pos = [1] inner_tiles = [32] into %0 : tensor<1x64x58x58xf32> -> tensor<1x2x58x58x32xf32> + %1 = linalg.pack %padded inner_dims_pos = [1] inner_tiles = [32] into %0 : tensor<1x64x58x58xf32> -> tensor<1x2x58x58x32xf32> return %padded, %1 : tensor<1x64x58x58xf32>, tensor<1x2x58x58x32xf32> } @@ -730,10 +730,10 @@ func.func @multi_use_pad_pack_propagation(%arg0: tensor<1x64x56x56xf32>) -> (ten // CHECK-SAME: %[[ARG0:.+]]: tensor<1x64x56x56xf32>) // CHECK: %[[CST:.+]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x2x56x56x32xf32> -// CHECK: %[[PACKED:.+]] = tensor.pack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [32] +// CHECK: %[[PACKED:.+]] = linalg.pack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [32] // CHECK-SAME: into %[[EMPTY]] : tensor<1x64x56x56xf32> -> tensor<1x2x56x56x32xf32> // CHECK: %[[PADDED:.+]] = tensor.pad %[[PACKED]] low[0, 0, 1, 1, 0] high[0, 0, 1, 1, 0] -// CHECK: %[[UNPACKED:.+]] = tensor.unpack %[[PADDED]] inner_dims_pos = [1] inner_tiles = [32] +// CHECK: %[[UNPACKED:.+]] = linalg.unpack %[[PADDED]] inner_dims_pos = [1] inner_tiles = [32] // CHECK: return %[[UNPACKED]], %[[PADDED]] // ----- @@ -749,7 +749,7 @@ func.func @would_break_dominance(%arg0: tensor<128x256xi32>) -> tensor<4x16x16x3 linalg.yield %4 : i32 } -> tensor<128x256xi32> %dest = bufferization.alloc_tensor() : tensor<4x16x16x32xi32> - %pack = tensor.pack %elem + %pack = linalg.pack %elem inner_dims_pos = [1, 0] inner_tiles = [16, 32] into %dest : tensor<128x256xi32> -> tensor<4x16x16x32xi32> @@ -763,7 +763,7 @@ func.func @would_break_dominance(%arg0: tensor<128x256xi32>) -> tensor<4x16x16x3 // CHECK-SAME: ins(%[[ARG0]] // CHECK-SAME: outs(%[[EMPTY]] // CHECK: %[[ALLOC:.+]] = bufferization.alloc_tensor() : tensor<4x16x16x32xi32> -// CHECK-NEXT: %{{.+}} = tensor.pack %[[GEN]] +// CHECK-NEXT: %{{.+}} = linalg.pack %[[GEN]] // CHECK-SAME: inner_dims_pos = [1, 0] inner_tiles = [16, 32] // CHECK-SAME: into %[[ALLOC]] @@ -779,7 +779,7 @@ func.func @scalar_tensor(%arg0 : tensor) -> tensor<1x32x7x7x32xf32> { linalg.yield %in : 
f32 } -> tensor<1x7x7x1024xf32> %empty_pack = tensor.empty() : tensor<1x32x7x7x32xf32> - %pack = tensor.pack %gen outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %empty_pack : tensor<1x7x7x1024xf32> -> tensor<1x32x7x7x32xf32> + %pack = linalg.pack %gen outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %empty_pack : tensor<1x7x7x1024xf32> -> tensor<1x32x7x7x32xf32> return %pack : tensor<1x32x7x7x32xf32> } @@ -800,7 +800,7 @@ func.func @scalar_tensor(%arg0 : tensor) -> tensor<1x32x7x7x32xf32> { func.func @unpack_empty_inner_dims(%arg0: tensor<12x64x56x56xf32>) -> tensor<12x56x56x64xf32> { %init = tensor.empty() : tensor<12x56x56x64xf32> %0 = tensor.empty() : tensor<12x56x56x64xf32> - %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [] inner_tiles = [] into %0 : tensor<12x64x56x56xf32> -> tensor<12x56x56x64xf32> + %1 = linalg.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [] inner_tiles = [] into %0 : tensor<12x64x56x56xf32> -> tensor<12x56x56x64xf32> %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1: tensor<12x56x56x64xf32>) outs(%init : tensor<12x56x56x64xf32>) { ^bb0(%in: f32, %out: f32): %3 = arith.addf %in, %in : f32 @@ -810,13 +810,13 @@ func.func @unpack_empty_inner_dims(%arg0: tensor<12x64x56x56xf32>) -> tensor<12x } // CHECK-LABEL: func.func @unpack_empty_inner_dims -// CHECK: %[[UNPACKED_ARG0:.+]] = tensor.unpack +// CHECK: %[[UNPACKED_ARG0:.+]] = linalg.unpack // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [] inner_tiles = [] -// CHECK: %[[PACKED_ARG0:.+]] = tensor.pack %[[UNPACKED_ARG0]] +// CHECK: %[[PACKED_ARG0:.+]] = linalg.pack %[[UNPACKED_ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [] inner_tiles = [] // CHECK: %[[RES:.+]] = linalg.generic // CHECK-SAME: ins(%[[PACKED_ARG0]] -// CHECK: %[[UNPACKED:.+]] = tensor.unpack %[[RES]] +// CHECK: %[[UNPACKED:.+]] = linalg.unpack %[[RES]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [] inner_tiles = [] // ----- @@ -833,7 +833,7 @@ func.func @reduction_pack_transpose_inner_dims(%arg0: tensor<128x256x32xi32>, linalg.yield %4 : i32 } -> tensor<128x256xi32> %dest = tensor.empty() : tensor<4x16x16x32xi32> - %pack = tensor.pack %elem + %pack = linalg.pack %elem inner_dims_pos = [1, 0] inner_tiles = [16, 32] into %dest : tensor<128x256xi32> -> tensor<4x16x16x32xi32> @@ -845,11 +845,11 @@ func.func @reduction_pack_transpose_inner_dims(%arg0: tensor<128x256x32xi32>, // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] // CHECK: %[[ARG1_EMPTY:.+]] = tensor.empty() : tensor<4x16x16x32xi32> -// CHECK: %[[PACK_ARG1:.+]] = tensor.pack %[[ARG1]] +// CHECK: %[[PACK_ARG1:.+]] = linalg.pack %[[ARG1]] // CHECK-SAME: inner_dims_pos = [1, 0] inner_tiles = [16, 32] // CHECK-SAME: into %[[ARG1_EMPTY]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<4x16x32x16x32xi32> -// CHECK: %[[PACK_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [1, 0] inner_tiles = [16, 32] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[RED:.+]] = linalg.generic @@ -879,7 +879,7 @@ func.func @reduction_pack_with_outer_dims(%arg0: tensor<100x128x200x256xi32>, %a linalg.yield %2 : i32 } -> tensor<100x128x256xi32> %init_pack = tensor.empty() : tensor<4x16x100x16x32xi32> - %4 = tensor.pack %reduction + %4 = linalg.pack %reduction outer_dims_perm = [1, 2, 0] 
inner_dims_pos = [2, 1] inner_tiles = [16, 32] @@ -897,15 +897,15 @@ func.func @reduction_pack_with_outer_dims(%arg0: tensor<100x128x200x256xi32>, %a // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]] // CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]] // CHECK: %[[ARG3_EMPTY:.+]] = tensor.empty() : tensor<4x16x100x16x32xi32> -// CHECK: %[[PACKED_ARG3:.+]] = tensor.pack %[[ARG3]] +// CHECK: %[[PACKED_ARG3:.+]] = linalg.pack %[[ARG3]] // CHECK-SAME: outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 32] // CHECK-SAME: into %[[ARG3_EMPTY]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<4x16x200x100x16x32xi32> -// CHECK: %[[PACKED_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACKED_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [1, 3, 2, 0] inner_dims_pos = [3, 1] inner_tiles = [16, 32] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[ARG2_EMPTY:.+]] = tensor.empty() : tensor<4x32xi32> -// CHECK: %[[PACKED_ARG2:.+]] = tensor.pack %[[ARG2]] +// CHECK: %[[PACKED_ARG2:.+]] = linalg.pack %[[ARG2]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [32] // CHECK-SAME: into %[[ARG2_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic @@ -922,7 +922,7 @@ func.func @unpack_different_destination_shape(%arg0: tensor<1x1x1080x1920x16xi32 %filter: tensor<2x2xi32>) -> tensor<16x540x960xi32>{ %init = tensor.empty() : tensor<16x540x960xi32> %empty = tensor.empty() : tensor<1x16x1080x1920xi32> - %unpack = tensor.unpack %arg0 + %unpack = linalg.unpack %arg0 inner_dims_pos = [1] inner_tiles = [16] into %empty : tensor<1x1x1080x1920x16xi32> -> tensor<1x16x1080x1920xi32> @@ -944,7 +944,7 @@ func.func @unpack_different_destination_shape(%arg0: tensor<1x1x1080x1920x16xi32 // CHECK: %[[FINAL_RES:.+]] = tensor.empty() : tensor<16x540x960xi32> // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x540x960x16xi32> // CHECK: %[[PACK_EMPTY:.+]] = tensor.empty() : tensor<1x1x1080x1920x16xi32> -// CHECK: %[[PACK_ARG0:.+]] = tensor.pack +// CHECK: %[[PACK_ARG0:.+]] = linalg.pack // CHECK-SAME: inner_dims_pos = [1] inner_tiles = [16] // CHECK-SAME: into %[[PACK_EMPTY]] // CHECK: %[[POOL:.+]] = linalg.generic @@ -952,7 +952,7 @@ func.func @unpack_different_destination_shape(%arg0: tensor<1x1x1080x1920x16xi32 // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "parallel"] // CHECK-SAME: ins(%[[PACK_ARG0]], %[[ARG1]] // CHECK-SAME: outs(%[[INIT]] -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[POOL]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[POOL]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [16] // CHECK-SAME: into %[[FINAL_RES]] // CHECK: return %[[UNPACK]] : tensor<16x540x960xi32> @@ -962,7 +962,7 @@ func.func @unpack_different_destination_shape(%arg0: tensor<1x1x1080x1920x16xi32 func.func @bubble_up_pack_through_collapse(%1: tensor, %dim : index) -> tensor { %collapsed = tensor.collapse_shape %1 [[0, 1], [2]] : tensor into tensor %2 = tensor.empty(%dim) : tensor - %pack = tensor.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %2 : tensor -> tensor + %pack = linalg.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %2 : tensor -> tensor func.return %pack : tensor } // CHECK-LABEL: func.func @bubble_up_pack_through_collapse @@ -971,7 +971,7 @@ func.func @bubble_up_pack_through_collapse(%1: tensor, %dim : index) // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : 
tensor -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [8, 1] into %[[EMPTY]] : tensor -> tensor +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [8, 1] into %[[EMPTY]] : tensor -> tensor // CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[PACK]] {{\[}}[0, 1], [2], [3], [4]] : tensor into tensor // CHECK: return %[[COLLAPSED]] : tensor @@ -980,7 +980,7 @@ func.func @bubble_up_pack_through_collapse(%1: tensor, %dim : index) func.func @bubble_up_pack_through_collapse_empty_outer_dims_perm(%1: tensor, %dim : index) -> tensor { %collapsed = tensor.collapse_shape %1 [[0, 1], [2]] : tensor into tensor %2 = tensor.empty(%dim) : tensor - %pack = tensor.pack %collapsed inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %2 : tensor -> tensor + %pack = linalg.pack %collapsed inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %2 : tensor -> tensor func.return %pack : tensor } // CHECK-LABEL: func.func @bubble_up_pack_through_collapse_empty_outer_dims_perm @@ -989,7 +989,7 @@ func.func @bubble_up_pack_through_collapse_empty_outer_dims_perm(%1: tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] inner_dims_pos = [1, 2] inner_tiles = [8, 1] into %[[EMPTY]] : tensor -> tensor +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] inner_dims_pos = [1, 2] inner_tiles = [8, 1] into %[[EMPTY]] : tensor -> tensor // CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[PACK]] {{\[}}[0, 1], [2], [3], [4]] : tensor into tensor // CHECK: return %[[COLLAPSED]] : tensor @@ -998,13 +998,13 @@ func.func @bubble_up_pack_through_collapse_empty_outer_dims_perm(%1: tensor) -> tensor<4x32x3072x8x1xf32> { %collapsed = tensor.collapse_shape %1 [[0], [1, 2], [3]] : tensor<4x192x16x256xf32> into tensor<4x3072x256xf32> %2 = tensor.empty() : tensor<4x32x3072x8x1xf32> - %pack = tensor.pack %collapsed outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [8, 1] into %2 : tensor<4x3072x256xf32> -> tensor<4x32x3072x8x1xf32> + %pack = linalg.pack %collapsed outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [8, 1] into %2 : tensor<4x3072x256xf32> -> tensor<4x32x3072x8x1xf32> func.return %pack : tensor<4x32x3072x8x1xf32> } // CHECK-LABEL: func.func @bubble_up_permuted_pack_through_collapse // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<4x32x192x16x8x1xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3, 2] inner_tiles = [8, 1] into %[[EMPTY]] : tensor<4x192x16x256xf32> -> tensor<4x32x192x16x8x1xf32> +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3, 2] inner_tiles = [8, 1] into %[[EMPTY]] : tensor<4x192x16x256xf32> -> tensor<4x32x192x16x8x1xf32> // CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %pack {{\[}}[0], [1], [2, 3], [4], [5]] : tensor<4x32x192x16x8x1xf32> into tensor<4x32x3072x8x1xf32> // CHECK: return %[[COLLAPSED]] : tensor<4x32x3072x8x1xf32> @@ -1013,13 +1013,13 @@ func.func @bubble_up_permuted_pack_through_collapse(%1: tensor<4x192x16x256xf32> func.func @bubble_up_pack_through_unit_collapse(%1: tensor<1x64x1x4xf32>) -> tensor<8x4x8x1xf32> { %collapsed = tensor.collapse_shape %1 [[0, 1, 2], [3]] : tensor<1x64x1x4xf32> into tensor<64x4xf32> %2 = tensor.empty() : tensor<8x4x8x1xf32> - %pack = tensor.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = 
[8, 1] into %2 : tensor<64x4xf32> -> tensor<8x4x8x1xf32> + %pack = linalg.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %2 : tensor<64x4xf32> -> tensor<8x4x8x1xf32> func.return %pack : tensor<8x4x8x1xf32> } // CHECK-LABEL: func.func @bubble_up_pack_through_unit_collapse // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x8x1x4x8x1xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [1, 3] inner_tiles = [8, 1] into %[[EMPTY]] : tensor<1x64x1x4xf32> -> tensor<1x8x1x4x8x1xf32> +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [1, 3] inner_tiles = [8, 1] into %[[EMPTY]] : tensor<1x64x1x4xf32> -> tensor<1x8x1x4x8x1xf32> // CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[PACK]] {{\[}}[0, 1, 2], [3], [4], [5]] : tensor<1x8x1x4x8x1xf32> into tensor<8x4x8x1xf32> // CHECK: return %[[COLLAPSED]] : tensor<8x4x8x1xf32> @@ -1028,7 +1028,7 @@ func.func @bubble_up_pack_through_unit_collapse(%1: tensor<1x64x1x4xf32>) -> ten func.func @bubble_up_pack_through_collapse_on_outer_dims(%1: tensor, %dim : index) -> tensor { %collapsed = tensor.collapse_shape %1 [[0, 1], [2]] : tensor into tensor %2 = tensor.empty(%dim) : tensor - %pack = tensor.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [4] into %2 : tensor -> tensor + %pack = linalg.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [4] into %2 : tensor -> tensor func.return %pack : tensor } // CHECK-LABEL: func.func @bubble_up_pack_through_collapse_on_outer_dims @@ -1037,7 +1037,7 @@ func.func @bubble_up_pack_through_collapse_on_outer_dims(%1: tensor, // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [2] inner_tiles = [4] into %[[EMPTY]] : tensor -> tensor +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [2] inner_tiles = [4] into %[[EMPTY]] : tensor -> tensor // CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[PACK]] {{\[}}[0, 1], [2], [3]] : tensor into tensor // CHECK: return %[[COLLAPSED]] : tensor @@ -1046,13 +1046,13 @@ func.func @bubble_up_pack_through_collapse_on_outer_dims(%1: tensor, func.func @no_bubble_up_pack_through_non_divisible_collapse(%1: tensor<3072x64x4xf32>) -> tensor<384x32x8x8xf32> { %collapsed = tensor.collapse_shape %1 [[0], [1, 2]] : tensor<3072x64x4xf32> into tensor<3072x256xf32> %2 = tensor.empty() : tensor<384x32x8x8xf32> - %pack = tensor.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %2 : tensor<3072x256xf32> -> tensor<384x32x8x8xf32> + %pack = linalg.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %2 : tensor<3072x256xf32> -> tensor<384x32x8x8xf32> func.return %pack : tensor<384x32x8x8xf32> } // CHECK-LABEL: func.func @no_bubble_up_pack_through_non_divisible_collapse // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0], [1, 2]] : tensor<3072x64x4xf32> into tensor<3072x256xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[COLLAPSED]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[COLLAPSED]] // CHECK: return %[[PACK]] : tensor<384x32x8x8xf32> // ----- @@ -1060,13 +1060,13 @@ func.func 
@no_bubble_up_pack_through_non_divisible_collapse(%1: tensor<3072x64x4 func.func @bubble_up_pack_outer_expanded_through_expand(%arg0: tensor<32x64xf32>) -> tensor<4x2x64x4xf32> { %empty = tensor.empty() : tensor<4x2x64x4xf32> %expanded = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [4, 8, 64] : tensor<32x64xf32> into tensor<4x8x64xf32> - %pack = tensor.pack %expanded inner_dims_pos = [1] inner_tiles = [4] into %empty : tensor<4x8x64xf32> -> tensor<4x2x64x4xf32> + %pack = linalg.pack %expanded inner_dims_pos = [1] inner_tiles = [4] into %empty : tensor<4x8x64xf32> -> tensor<4x2x64x4xf32> return %pack : tensor<4x2x64x4xf32> } // CHECK-LABEL: func.func @bubble_up_pack_outer_expanded_through_expand( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x64x4xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [4] into %[[EMPTY]] : tensor<32x64xf32> -> tensor<8x64x4xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[PACK]] {{\[}}[0, 1], [2], [3]] // CHECK-SAME: output_shape [4, 2, 64, 4] : tensor<8x64x4xf32> into tensor<4x2x64x4xf32> @@ -1077,13 +1077,13 @@ func.func @bubble_up_pack_outer_expanded_through_expand(%arg0: tensor<32x64xf32> func.func @bubble_up_pack_inner_expanded_through_expand(%arg0: tensor<32x64xf32>) -> tensor<32x4x4x4xf32> { %empty = tensor.empty() : tensor<32x4x4x4xf32> %expanded = tensor.expand_shape %arg0 [[0], [1, 2]] output_shape [32, 4, 16] : tensor<32x64xf32> into tensor<32x4x16xf32> - %pack = tensor.pack %expanded inner_dims_pos = [2] inner_tiles = [4] into %empty : tensor<32x4x16xf32> -> tensor<32x4x4x4xf32> + %pack = linalg.pack %expanded inner_dims_pos = [2] inner_tiles = [4] into %empty : tensor<32x4x16xf32> -> tensor<32x4x4x4xf32> return %pack : tensor<32x4x4x4xf32> } // CHECK-LABEL: func.func @bubble_up_pack_inner_expanded_through_expand( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32x16x4xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [1] inner_tiles = [4] into %[[EMPTY]] // CHECK-SAME: : tensor<32x64xf32> -> tensor<32x16x4xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[PACK]] {{\[}}[0], [1, 2], [3]] @@ -1095,13 +1095,13 @@ func.func @bubble_up_pack_inner_expanded_through_expand(%arg0: tensor<32x64xf32> func.func @bubble_up_pack_non_expanded_dims_through_expand(%arg0: tensor<32x64x16xf32>) -> tensor<8x2x32x16x4xf32> { %empty = tensor.empty() : tensor<8x2x32x16x4xf32> %expanded = tensor.expand_shape %arg0 [[0], [1, 2], [3]] output_shape [32, 2, 32, 16] : tensor<32x64x16xf32> into tensor<32x2x32x16xf32> - %pack = tensor.pack %expanded inner_dims_pos = [0] inner_tiles = [4] into %empty : tensor<32x2x32x16xf32> -> tensor<8x2x32x16x4xf32> + %pack = linalg.pack %expanded inner_dims_pos = [0] inner_tiles = [4] into %empty : tensor<32x2x32x16xf32> -> tensor<8x2x32x16x4xf32> return %pack : tensor<8x2x32x16x4xf32> } // CHECK-LABEL: func.func @bubble_up_pack_non_expanded_dims_through_expand( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x64x16x4xf32> -// CHECK: %[[PACK:.+]] = tensor.pack +// CHECK: %[[PACK:.+]] = linalg.pack // CHECK-SAME: %[[ARG0]] inner_dims_pos = [0] inner_tiles = [4] into %[[EMPTY]] // CHECK-SAME: : tensor<32x64x16xf32> -> tensor<8x64x16x4xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[PACK]] {{\[}}[0], [1, 2], [3], 
[4]] @@ -1115,7 +1115,7 @@ func.func @bubble_up_pack_through_expand_dynamic(%arg0: tensor) -> ten %dim = tensor.dim %arg0, %c0 : tensor %empty = tensor.empty(%dim) : tensor %expanded = tensor.expand_shape %arg0 [[0], [1, 2]] output_shape [%dim, 4, 16] : tensor into tensor - %pack = tensor.pack %expanded inner_dims_pos = [2] inner_tiles = [8] into %empty : tensor -> tensor + %pack = linalg.pack %expanded inner_dims_pos = [2] inner_tiles = [8] into %empty : tensor -> tensor return %pack : tensor } // CHECK-LABEL: func.func @bubble_up_pack_through_expand_dynamic( @@ -1123,7 +1123,7 @@ func.func @bubble_up_pack_through_expand_dynamic(%arg0: tensor) -> ten // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM_INPUT:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM_INPUT]]) : tensor -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [1] inner_tiles = [8] into %[[EMPTY]] // CHECK-SAME: : tensor -> tensor // CHECK: %[[DIM_PACK:.+]] = tensor.dim %[[PACK]], %[[C0]] : tensor @@ -1137,14 +1137,14 @@ func.func @bubble_up_pack_non_expanded_padding_through_expand(%arg0: tensor<32x6 %cst = arith.constant 3.000000e+00 : f32 %empty = tensor.empty() : tensor<4x2x8x4x8xf32> %expanded = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [4, 8, 64] : tensor<32x60xf32> into tensor<4x8x60xf32> - %pack = tensor.pack %expanded padding_value(%cst : f32) inner_dims_pos = [1, 2] inner_tiles = [4, 8] into %empty : tensor<4x8x60xf32> -> tensor<4x2x8x4x8xf32> + %pack = linalg.pack %expanded padding_value(%cst : f32) inner_dims_pos = [1, 2] inner_tiles = [4, 8] into %empty : tensor<4x8x60xf32> -> tensor<4x2x8x4x8xf32> return %pack : tensor<4x2x8x4x8xf32> } // CHECK-LABEL: func.func @bubble_up_pack_non_expanded_padding_through_expand( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-DAG: %[[CST:.+]] = arith.constant 3.000000e+00 : f32 // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x8x4x8xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] padding_value(%[[CST]] : f32) +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] padding_value(%[[CST]] : f32) // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %[[EMPTY]] // CHECK-SAME: : tensor<32x60xf32> -> tensor<8x8x4x8xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[PACK]] {{\[}}[0, 1], [2], [3], [4]] @@ -1156,13 +1156,13 @@ func.func @bubble_up_pack_non_expanded_padding_through_expand(%arg0: tensor<32x6 func.func @bubble_up_pack_outer_dims_perm_identity_through_expand(%arg0: tensor<32x64xf32>) -> tensor<4x2x32x4x2xf32> { %empty = tensor.empty() : tensor<4x2x32x4x2xf32> %expanded = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [4, 8, 64] : tensor<32x64xf32> into tensor<4x8x64xf32> - %pack = tensor.pack %expanded outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 2] into %empty : tensor<4x8x64xf32> -> tensor<4x2x32x4x2xf32> + %pack = linalg.pack %expanded outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 2] into %empty : tensor<4x8x64xf32> -> tensor<4x2x32x4x2xf32> return %pack : tensor<4x2x32x4x2xf32> } // CHECK-LABEL: func.func @bubble_up_pack_outer_dims_perm_identity_through_expand( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x32x4x2xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [4, 2] into %[[EMPTY]] // CHECK-SAME: : 
tensor<32x64xf32> -> tensor<8x32x4x2xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[PACK]] {{\[}}[0, 1], [2], [3], [4]] @@ -1174,13 +1174,13 @@ func.func @bubble_up_pack_outer_dims_perm_identity_through_expand(%arg0: tensor< func.func @bubble_up_pack_multiple_dims_through_expand(%arg0: tensor<32x64x16xf32>) -> tensor<8x2x4x8x4x8x2xf32> { %empty = tensor.empty() : tensor<8x2x4x8x4x8x2xf32> %expanded = tensor.expand_shape %arg0 [[0], [1, 2], [3]] output_shape [32, 2, 32, 16] : tensor<32x64x16xf32> into tensor<32x2x32x16xf32> - %pack = tensor.pack %expanded inner_dims_pos = [0, 2, 3] inner_tiles = [4, 8, 2] into %empty : tensor<32x2x32x16xf32> -> tensor<8x2x4x8x4x8x2xf32> + %pack = linalg.pack %expanded inner_dims_pos = [0, 2, 3] inner_tiles = [4, 8, 2] into %empty : tensor<32x2x32x16xf32> -> tensor<8x2x4x8x4x8x2xf32> return %pack : tensor<8x2x4x8x4x8x2xf32> } // CHECK-LABEL: func.func @bubble_up_pack_multiple_dims_through_expand( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x8x8x4x8x2xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [0, 1, 2] inner_tiles = [4, 8, 2] into %[[EMPTY]] // CHECK-SAME: : tensor<32x64x16xf32> -> tensor<8x8x8x4x8x2xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[PACK]] {{\[}}[0], [1, 2], [3], [4], [5], [6]] @@ -1192,13 +1192,13 @@ func.func @bubble_up_pack_multiple_dims_through_expand(%arg0: tensor<32x64x16xf3 func.func @bubble_up_pack_inner_dims_reorder_through_expand(%arg0: tensor<32x64xf32>) -> tensor<4x2x4x16x4xf32> { %empty = tensor.empty() : tensor<4x2x4x16x4xf32> %expanded = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [4, 8, 64] : tensor<32x64xf32> into tensor<4x8x64xf32> - %pack = tensor.pack %expanded inner_dims_pos = [2, 1] inner_tiles = [16, 4] into %empty : tensor<4x8x64xf32> -> tensor<4x2x4x16x4xf32> + %pack = linalg.pack %expanded inner_dims_pos = [2, 1] inner_tiles = [16, 4] into %empty : tensor<4x8x64xf32> -> tensor<4x2x4x16x4xf32> return %pack : tensor<4x2x4x16x4xf32> } // CHECK-LABEL: func.func @bubble_up_pack_inner_dims_reorder_through_expand( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x4x16x4xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [1, 0] inner_tiles = [16, 4] into %[[EMPTY]] // CHECK-SAME: : tensor<32x64xf32> -> tensor<8x4x16x4xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[PACK]] {{\[}}[0, 1], [2], [3], [4]] @@ -1210,13 +1210,13 @@ func.func @bubble_up_pack_inner_dims_reorder_through_expand(%arg0: tensor<32x64x func.func @bubble_up_pack_multiple_different_expanded_dims_through_expand(%arg0: tensor<32x64x16xf32>) -> tensor<4x2x2x8x16x4x4xf32> { %empty = tensor.empty() : tensor<4x2x2x8x16x4x4xf32> %expanded = tensor.expand_shape %arg0 [[0, 1], [2, 3], [4]] output_shape [4, 8, 2, 32, 16] : tensor<32x64x16xf32> into tensor<4x8x2x32x16xf32> - %pack = tensor.pack %expanded inner_dims_pos = [1, 3] inner_tiles = [4, 4] into %empty : tensor<4x8x2x32x16xf32> -> tensor<4x2x2x8x16x4x4xf32> + %pack = linalg.pack %expanded inner_dims_pos = [1, 3] inner_tiles = [4, 4] into %empty : tensor<4x8x2x32x16xf32> -> tensor<4x2x2x8x16x4x4xf32> return %pack : tensor<4x2x2x8x16x4x4xf32> } // CHECK-LABEL: func.func @bubble_up_pack_multiple_different_expanded_dims_through_expand( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EMPTY:.+]] = tensor.empty() : 
tensor<8x16x16x4x4xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [4, 4] into %[[EMPTY]] // CHECK-SAME: : tensor<32x64x16xf32> -> tensor<8x16x16x4x4xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[PACK]] {{\[}}[0, 1], [2, 3], [4], [5], [6]] @@ -1228,7 +1228,7 @@ func.func @bubble_up_pack_multiple_different_expanded_dims_through_expand(%arg0: func.func @no_bubble_up_pack_outer_dims_permutation_through_expand(%arg0: tensor<32x64xf32>) -> tensor<32x4x2x4x2xf32> { %empty = tensor.empty() : tensor<32x4x2x4x2xf32> %expanded = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [4, 8, 64] : tensor<32x64xf32> into tensor<4x8x64xf32> - %pack = tensor.pack %expanded outer_dims_perm = [2, 0, 1] inner_dims_pos = [1, 2] inner_tiles = [4, 2] into %empty : tensor<4x8x64xf32> -> tensor<32x4x2x4x2xf32> + %pack = linalg.pack %expanded outer_dims_perm = [2, 0, 1] inner_dims_pos = [1, 2] inner_tiles = [4, 2] into %empty : tensor<4x8x64xf32> -> tensor<32x4x2x4x2xf32> return %pack : tensor<32x4x2x4x2xf32> } // CHECK-LABEL: func.func @no_bubble_up_pack_outer_dims_permutation_through_expand( @@ -1236,7 +1236,7 @@ func.func @no_bubble_up_pack_outer_dims_permutation_through_expand(%arg0: tensor // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32x4x2x4x2xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2]] // CHECK-SAME: output_shape [4, 8, 64] : tensor<32x64xf32> into tensor<4x8x64xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[EXPANDED]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[EXPANDED]] // CHECK-SAME: outer_dims_perm = [2, 0, 1] inner_dims_pos = [1, 2] inner_tiles = [4, 2] into %[[EMPTY]] // CHECK-SAME: : tensor<4x8x64xf32> -> tensor<32x4x2x4x2xf32> // CHECK: return %[[PACK]] : tensor<32x4x2x4x2xf32> @@ -1246,7 +1246,7 @@ func.func @no_bubble_up_pack_outer_dims_permutation_through_expand(%arg0: tensor func.func @no_bubble_up_pack_multiple_same_expanded_dim_through_expand(%arg0: tensor<32x64xf32>) -> tensor<2x2x64x2x4xf32> { %empty = tensor.empty() : tensor<2x2x64x2x4xf32> %expanded = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [4, 8, 64] : tensor<32x64xf32> into tensor<4x8x64xf32> - %pack = tensor.pack %expanded inner_dims_pos = [0, 1] inner_tiles = [2, 4] into %empty : tensor<4x8x64xf32> -> tensor<2x2x64x2x4xf32> + %pack = linalg.pack %expanded inner_dims_pos = [0, 1] inner_tiles = [2, 4] into %empty : tensor<4x8x64xf32> -> tensor<2x2x64x2x4xf32> return %pack : tensor<2x2x64x2x4xf32> } // CHECK-LABEL: func.func @no_bubble_up_pack_multiple_same_expanded_dim_through_expand( @@ -1254,7 +1254,7 @@ func.func @no_bubble_up_pack_multiple_same_expanded_dim_through_expand(%arg0: te // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<2x2x64x2x4xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2]] // CHECK-SAME: output_shape [4, 8, 64] : tensor<32x64xf32> into tensor<4x8x64xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[EXPANDED]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[EXPANDED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [2, 4] into %[[EMPTY]] // CHECK-SAME: : tensor<4x8x64xf32> -> tensor<2x2x64x2x4xf32> // CHECK: return %[[PACK]] : tensor<2x2x64x2x4xf32> @@ -1264,7 +1264,7 @@ func.func @no_bubble_up_pack_multiple_same_expanded_dim_through_expand(%arg0: te func.func @no_bubble_up_pack_non_innermost_expanded_dim_through_expand(%arg0: tensor<32x64xf32>) -> tensor<2x8x64x2xf32> { %empty = tensor.empty() : tensor<2x8x64x2xf32> 
%expanded = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [4, 8, 64] : tensor<32x64xf32> into tensor<4x8x64xf32> - %pack = tensor.pack %expanded inner_dims_pos = [0] inner_tiles = [2] into %empty : tensor<4x8x64xf32> -> tensor<2x8x64x2xf32> + %pack = linalg.pack %expanded inner_dims_pos = [0] inner_tiles = [2] into %empty : tensor<4x8x64xf32> -> tensor<2x8x64x2xf32> return %pack : tensor<2x8x64x2xf32> } // CHECK-LABEL: func.func @no_bubble_up_pack_non_innermost_expanded_dim_through_expand( @@ -1272,7 +1272,7 @@ func.func @no_bubble_up_pack_non_innermost_expanded_dim_through_expand(%arg0: te // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<2x8x64x2xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2]] // CHECK-SAME: output_shape [4, 8, 64] : tensor<32x64xf32> into tensor<4x8x64xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[EXPANDED]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[EXPANDED]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [2] into %[[EMPTY]] // CHECK-SAME: : tensor<4x8x64xf32> -> tensor<2x8x64x2xf32> // CHECK: return %[[PACK]] : tensor<2x8x64x2xf32> @@ -1283,7 +1283,7 @@ func.func @no_bubble_up_pack_expanded_padding_through_expand_cannot_reassociate( %cst = arith.constant 3.000000e+00 : f32 %empty = tensor.empty() : tensor<3x2x60x8xf32> %expanded = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [3, 10, 60] : tensor<30x60xf32> into tensor<3x10x60xf32> - %pack = tensor.pack %expanded padding_value(%cst : f32) inner_dims_pos = [1] inner_tiles = [8] into %empty : tensor<3x10x60xf32> -> tensor<3x2x60x8xf32> + %pack = linalg.pack %expanded padding_value(%cst : f32) inner_dims_pos = [1] inner_tiles = [8] into %empty : tensor<3x10x60xf32> -> tensor<3x2x60x8xf32> return %pack : tensor<3x2x60x8xf32> } // CHECK-LABEL: func.func @no_bubble_up_pack_expanded_padding_through_expand_cannot_reassociate( @@ -1292,7 +1292,7 @@ func.func @no_bubble_up_pack_expanded_padding_through_expand_cannot_reassociate( // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<3x2x60x8xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2]] // CHECK-SAME: output_shape [3, 10, 60] : tensor<30x60xf32> into tensor<3x10x60xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[EXPANDED]] padding_value(%[[CST]] : f32) +// CHECK: %[[PACK:.+]] = linalg.pack %[[EXPANDED]] padding_value(%[[CST]] : f32) // CHECK-SAME: inner_dims_pos = [1] inner_tiles = [8] into %[[EMPTY]] // CHECK-SAME: : tensor<3x10x60xf32> -> tensor<3x2x60x8xf32> // CHECK: return %[[PACK]] : tensor<3x2x60x8xf32> @@ -1302,7 +1302,7 @@ func.func @no_bubble_up_pack_expanded_padding_through_expand_cannot_reassociate( func.func @no_bubble_up_pack_extending_dimension_through_expand_cannot_reassociate(%arg0: tensor<32x64xf32>) -> tensor<8x4x16x8xf32> { %empty = tensor.empty() : tensor<8x4x16x8xf32> %expanded = tensor.expand_shape %arg0 [[0], [1, 2]] output_shape [32, 4, 16] : tensor<32x64xf32> into tensor<32x4x16xf32> - %pack = tensor.pack %expanded inner_dims_pos = [0] inner_tiles = [8] into %empty : tensor<32x4x16xf32> -> tensor<8x4x16x8xf32> + %pack = linalg.pack %expanded inner_dims_pos = [0] inner_tiles = [8] into %empty : tensor<32x4x16xf32> -> tensor<8x4x16x8xf32> return %pack : tensor<8x4x16x8xf32> } // CHECK-LABEL: func.func @no_bubble_up_pack_extending_dimension_through_expand_cannot_reassociate( @@ -1310,7 +1310,7 @@ func.func @no_bubble_up_pack_extending_dimension_through_expand_cannot_reassocia // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x4x16x8xf32> // CHECK: %[[EXPANDED:.+]] = 
tensor.expand_shape %[[ARG0]] {{\[}}[0], [1, 2]] // CHECK-SAME: output_shape [32, 4, 16] : tensor<32x64xf32> into tensor<32x4x16xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[EXPANDED]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[EXPANDED]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [8] into %[[EMPTY]] // CHECK-SAME: : tensor<32x4x16xf32> -> tensor<8x4x16x8xf32> // CHECK: return %[[PACK]] : tensor<8x4x16x8xf32> @@ -1319,7 +1319,7 @@ func.func @no_bubble_up_pack_extending_dimension_through_expand_cannot_reassocia func.func @push_down_unpack_through_expand(%5: tensor, %dim: index, %sz0: index) -> tensor { %6 = tensor.empty(%dim) : tensor - %unpack = tensor.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor -> tensor + %unpack = linalg.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor -> tensor %expanded = tensor.expand_shape %unpack [[0, 1], [2]] output_shape [%sz0, 256, 256] : tensor into tensor func.return %expanded : tensor } @@ -1333,14 +1333,14 @@ func.func @push_down_unpack_through_expand(%5: tensor, %dim: index // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2], [3], [4]] output_shape [%[[SZ0]], 32, 32, 8, 8] : tensor into tensor // CHECK: %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] : tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[EXPANDED:.+]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [8, 8] into %[[EMPTY]] : tensor -> tensor +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[EXPANDED:.+]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [8, 8] into %[[EMPTY]] : tensor -> tensor // CHECK: return %[[UNPACK]] : tensor // ----- func.func @push_down_unpack_through_expand_empty_outer_dims_perm(%5: tensor, %dim: index, %sz0: index) -> tensor { %6 = tensor.empty(%dim) : tensor - %unpack = tensor.unpack %5 inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor -> tensor + %unpack = linalg.unpack %5 inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor -> tensor %expanded = tensor.expand_shape %unpack [[0, 1], [2]] output_shape [%sz0, 256, 256] : tensor into tensor func.return %expanded : tensor } @@ -1354,14 +1354,14 @@ func.func @push_down_unpack_through_expand_empty_outer_dims_perm(%5: tensor into tensor // CHECK: %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] : tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[EXPANDED:.+]] inner_dims_pos = [1, 2] inner_tiles = [8, 8] into %[[EMPTY]] : tensor -> tensor +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[EXPANDED:.+]] inner_dims_pos = [1, 2] inner_tiles = [8, 8] into %[[EMPTY]] : tensor -> tensor // CHECK: return %[[UNPACK]] : tensor // ----- func.func @push_down_permuted_unpack_through_expand(%5: tensor<4x32x384x8x8xf32>) -> tensor<4x12x256x256xf32> { %6 = tensor.empty() : tensor<4x3072x256xf32> - %unpack = tensor.unpack %5 outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [8, 8] into %6 : tensor<4x32x384x8x8xf32> -> tensor<4x3072x256xf32> + %unpack = linalg.unpack %5 outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [8, 8] into %6 : tensor<4x32x384x8x8xf32> -> tensor<4x3072x256xf32> %expanded = tensor.expand_shape %unpack [[0], [1, 2], [3]] output_shape [4, 12, 256, 256] : tensor<4x3072x256xf32> into tensor<4x12x256x256xf32> func.return %expanded : tensor<4x12x256x256xf32> } @@ -1369,14 +1369,14 @@ func.func 
@push_down_permuted_unpack_through_expand(%5: tensor<4x32x384x8x8xf32> // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1], [2, 3], [4], [5]] output_shape [4, 32, 12, 32, 8, 8] : tensor<4x32x384x8x8xf32> into tensor<4x32x12x32x8x8xf32> // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<4x12x256x256xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[EXPANDED]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3, 2] inner_tiles = [8, 8] into %[[EMPTY]] : tensor<4x32x12x32x8x8xf32> -> tensor<4x12x256x256xf32> +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[EXPANDED]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3, 2] inner_tiles = [8, 8] into %[[EMPTY]] : tensor<4x32x12x32x8x8xf32> -> tensor<4x12x256x256xf32> // CHECK: return %[[UNPACK]] : tensor<4x12x256x256xf32> // ----- func.func @push_down_unpack_through_unit_expand(%5: tensor<6x32x8x8xf32>) -> tensor<3x16x1x256xf32> { %6 = tensor.empty() : tensor<48x256xf32> - %unpack = tensor.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor<6x32x8x8xf32> -> tensor<48x256xf32> + %unpack = linalg.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor<6x32x8x8xf32> -> tensor<48x256xf32> %expanded = tensor.expand_shape %unpack [[0, 1, 2], [3]] output_shape [3, 16, 1, 256] : tensor<48x256xf32> into tensor<3x16x1x256xf32> func.return %expanded : tensor<3x16x1x256xf32> } @@ -1384,14 +1384,14 @@ func.func @push_down_unpack_through_unit_expand(%5: tensor<6x32x8x8xf32>) -> ten // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1, 2], [3], [4], [5]] output_shape [3, 2, 1, 32, 8, 8] : tensor<6x32x8x8xf32> into tensor<3x2x1x32x8x8xf32> // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<3x16x1x256xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[EXPANDED]] outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [1, 3] inner_tiles = [8, 8] into %[[EMPTY]] : tensor<3x2x1x32x8x8xf32> -> tensor<3x16x1x256xf32> +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[EXPANDED]] outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [1, 3] inner_tiles = [8, 8] into %[[EMPTY]] : tensor<3x2x1x32x8x8xf32> -> tensor<3x16x1x256xf32> // CHECK: return %[[UNPACK]] : tensor<3x16x1x256xf32> // ----- func.func @push_down_unpack_through_expand_on_outer_dims(%5: tensor, %dim: index, %sz0: index) -> tensor { %6 = tensor.empty(%dim) : tensor - %unpack = tensor.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [8] into %6 : tensor -> tensor + %unpack = linalg.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [8] into %6 : tensor -> tensor %expanded = tensor.expand_shape %unpack [[0, 1], [2]] output_shape [%sz0, 256, 256] : tensor into tensor func.return %expanded : tensor } @@ -1405,19 +1405,19 @@ func.func @push_down_unpack_through_expand_on_outer_dims(%5: tensor, // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2], [3]] output_shape [%[[SZ0]], 256, 32, 8] : tensor into tensor // CHECK: %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] : tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[EXPANDED:.+]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [2] inner_tiles = [8] into %[[EMPTY]] : tensor -> tensor +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[EXPANDED:.+]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [2] inner_tiles = [8] into %[[EMPTY]] : tensor -> tensor // CHECK: return %[[UNPACK]] : 
tensor // ----- func.func @no_push_down_unpack_through_non_divisible_expand(%5: tensor<384x32x8x8xf32>) -> tensor<256x12x256xf32> { %6 = tensor.empty() : tensor<3072x256xf32> - %unpack = tensor.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor<384x32x8x8xf32> -> tensor<3072x256xf32> + %unpack = linalg.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor<384x32x8x8xf32> -> tensor<3072x256xf32> %expanded = tensor.expand_shape %unpack [[0, 1], [2]] output_shape [256, 12, 256] : tensor<3072x256xf32> into tensor<256x12x256xf32> func.return %expanded : tensor<256x12x256xf32> } // CHECK-LABEL: func.func @no_push_down_unpack_through_non_divisible_expand // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[UNPACK]] {{\[}}[0, 1], [2]] output_shape [256, 12, 256] : tensor<3072x256xf32> into tensor<256x12x256xf32> // CHECK: return %[[EXPANDED]] : tensor<256x12x256xf32> diff --git a/mlir/test/Dialect/Linalg/decompose-tensor-pack-tile.mlir b/mlir/test/Dialect/Linalg/decompose-tensor-pack-tile.mlir index ec761d9a49436..72fde5490a305 100644 --- a/mlir/test/Dialect/Linalg/decompose-tensor-pack-tile.mlir +++ b/mlir/test/Dialect/Linalg/decompose-tensor-pack-tile.mlir @@ -4,7 +4,7 @@ // RUN: -transform-interpreter %s | FileCheck %s func.func @KCRS_to_KCRSsr(%arg0: tensor<1x1x128x64xf32>, %arg1: tensor<1x1x4x8x8x32xf32>) -> tensor<1x1x4x8x8x32xf32> { - %0 = tensor.pack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x128x64xf32> -> tensor<1x1x4x8x8x32xf32> + %0 = linalg.pack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x128x64xf32> -> tensor<1x1x4x8x8x32xf32> return %0 : tensor<1x1x4x8x8x32xf32> } // CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 * 32)> @@ -27,7 +27,7 @@ func.func @KCRS_to_KCRSsr(%arg0: tensor<1x1x128x64xf32>, %arg1: tensor<1x1x4x8x8 module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1, %loops:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -36,7 +36,7 @@ module attributes {transform.with_named_sequence} { // ----- func.func @pad_and_pack(%arg0: tensor<13x15xf32>, %arg1: tensor<2x8x8x2xf32>, %arg2: f32) -> tensor<2x8x8x2xf32> { - %0 = tensor.pack %arg0 padding_value(%arg2 : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %arg1 : tensor<13x15xf32> -> tensor<2x8x8x2xf32> + %0 = linalg.pack %arg0 padding_value(%arg2 : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %arg1 : tensor<13x15xf32> -> tensor<2x8x8x2xf32> return %0 : tensor<2x8x8x2xf32> } // CHECK: func.func @pad_and_pack @@ -54,7 +54,7 @@ func.func @pad_and_pack(%arg0: tensor<13x15xf32>, %arg1: tensor<2x8x8x2xf32>, %a module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = 
transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -64,7 +64,7 @@ module attributes {transform.with_named_sequence} { func.func @KC_to_CKkc(%arg0: tensor<128x256xf32>, %arg1: tensor<32x4x32x8xf32>) -> tensor<32x4x32x8xf32> { - %0 = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<128x256xf32> -> tensor<32x4x32x8xf32> + %0 = linalg.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<128x256xf32> -> tensor<32x4x32x8xf32> return %0 : tensor<32x4x32x8xf32> } // CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 * 32)> @@ -85,7 +85,7 @@ func.func @KC_to_CKkc(%arg0: tensor<128x256xf32>, %arg1: tensor<32x4x32x8xf32>) // CHECK-SAME: [%[[C]], %[[K]], 0, 0] [1, 1, 32, 8] [1, 1, 1, 1] : tensor<1x1x32x8xf32> into tensor<32x4x32x8xf32> module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } diff --git a/mlir/test/Dialect/Linalg/decompose-tensor-pack.mlir b/mlir/test/Dialect/Linalg/decompose-tensor-pack.mlir index 1cc1484ed4095..911b453f919c3 100644 --- a/mlir/test/Dialect/Linalg/decompose-tensor-pack.mlir +++ b/mlir/test/Dialect/Linalg/decompose-tensor-pack.mlir @@ -5,7 +5,7 @@ func.func @simple_KCRS_to_KCRSsr(%arg0: tensor, %arg1: tensor<1x1x?x1xi32>) -> tensor<1x1x?x1xi32> { %c8 = arith.constant 8 : index %c5 = arith.constant 5 : i32 - %pack = tensor.pack %arg0 padding_value(%c5 : i32) inner_dims_pos = [0, 1] inner_tiles = [%c8, 1] into %arg1 : tensor -> tensor<1x1x?x1xi32> + %pack = linalg.pack %arg0 padding_value(%c5 : i32) inner_dims_pos = [0, 1] inner_tiles = [%c8, 1] into %arg1 : tensor -> tensor<1x1x?x1xi32> return %pack : tensor<1x1x?x1xi32> } @@ -32,7 +32,7 @@ func.func @simple_KCRS_to_KCRSsr(%arg0: tensor, %arg1: tensor<1x1x?x1xi // ----- func.func @simple_pad_and_pack_static_tiles(%input: tensor<5x1xf32>, %output: tensor<1x1x8x2xf32>, %pad: f32) -> tensor<1x1x8x2xf32> { - %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor<5x1xf32> -> tensor<1x1x8x2xf32> + %0 = linalg.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor<5x1xf32> -> tensor<1x1x8x2xf32> return %0 : tensor<1x1x8x2xf32> } // CHECK: #[[$ATTR_0:.+]] = affine_map<()[s0] -> (s0 - 5)> @@ -52,7 +52,7 @@ func.func @simple_pad_and_pack_static_tiles(%input: tensor<5x1xf32>, %output: te /// Same as example above, but with 1 dynamic tile size. 
func.func @simple_pad_and_pack_dynamic_tile(%input: tensor<5x1xf32>, %output: tensor<1x1x?x2xf32>, %pad: f32, %tile_dim_0: index) -> tensor<1x1x?x2xf32> { - %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_dim_0, 2] into %output : tensor<5x1xf32> -> tensor<1x1x?x2xf32> + %0 = linalg.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_dim_0, 2] into %output : tensor<5x1xf32> -> tensor<1x1x?x2xf32> return %0 : tensor<1x1x?x2xf32> } // CHECK-LABEL: func.func @simple_pad_and_pack_dynamic_tile( @@ -72,7 +72,7 @@ func.func @simple_pad_and_pack_dynamic_tile(%input: tensor<5x1xf32>, %output: te func.func @simple_pad_and_pack_dynamic_tile_cst(%input: tensor<5x1xf32>, %output: tensor<1x1x?x2xf32>, %pad: f32) -> tensor<1x1x?x2xf32> { %tile_dim_0 = arith.constant 8 : index - %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_dim_0, 2] into %output : tensor<5x1xf32> -> tensor<1x1x?x2xf32> + %0 = linalg.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_dim_0, 2] into %output : tensor<5x1xf32> -> tensor<1x1x?x2xf32> return %0 : tensor<1x1x?x2xf32> } // CHECK-LABEL: func.func @simple_pad_and_pack_dynamic_tile_cst( @@ -86,7 +86,7 @@ func.func @simple_pad_and_pack_dynamic_tile_cst(%input: tensor<5x1xf32>, %output // CHECK: return %[[RES]] : tensor<1x1x?x2xf32> func.func @simple_pad_and_pack_dynamic_tile_transpose(%input: tensor<5x1xf32>, %output: tensor<1x1x2x?xf32>, %pad: f32, %tile_dim_1: index) -> tensor<1x1x2x?xf32> { - %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [1, 0] inner_tiles = [2, %tile_dim_1] into %output : tensor<5x1xf32> -> tensor<1x1x2x?xf32> + %0 = linalg.pack %input padding_value(%pad : f32) inner_dims_pos = [1, 0] inner_tiles = [2, %tile_dim_1] into %output : tensor<5x1xf32> -> tensor<1x1x2x?xf32> return %0 : tensor<1x1x2x?xf32> } // CHECK-LABEL: func.func @simple_pad_and_pack_dynamic_tile_transpose( @@ -116,7 +116,7 @@ func.func @simple_pad_and_pack_scalable_tile(%input: tensor<5x1xf32>, %output: t %c8 = arith.constant 8 : index %vscale = vector.vscale %c8_vscale = arith.muli %vscale, %c8 : index - %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%c8_vscale, 2] into %output : tensor<5x1xf32> -> tensor<1x1x?x2xf32> + %0 = linalg.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%c8_vscale, 2] into %output : tensor<5x1xf32> -> tensor<1x1x?x2xf32> return %0 : tensor<1x1x?x2xf32> } @@ -138,7 +138,7 @@ func.func @simple_pad_and_pack_scalable_tile(%input: tensor<5x1xf32>, %output: t /// Same as example above, but with both tile sizes dynamic. 
func.func @simple_pad_and_pack_dynamic_tiles(%input: tensor<5x1xf32>, %output: tensor<1x1x?x?xf32>, %pad: f32, %tile_dim_0: index, %tile_dim_1: index) -> tensor<1x1x?x?xf32> { - %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_dim_0, %tile_dim_1] into %output : tensor<5x1xf32> -> tensor<1x1x?x?xf32> + %0 = linalg.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_dim_0, %tile_dim_1] into %output : tensor<5x1xf32> -> tensor<1x1x?x?xf32> return %0 : tensor<1x1x?x?xf32> } // CHECK-LABEL: func.func @simple_pad_and_pack_dynamic_tiles( @@ -158,7 +158,7 @@ func.func @simple_pad_and_pack_dynamic_tiles(%input: tensor<5x1xf32>, %output: t // ----- func.func @simple_pad_and_pack_dynamic_tile_not_all_dims_tiled(%input: tensor<1x1x5x1xf32>, %output: tensor<1x1x1x1x2x?xf32>, %pad: f32, %high: index) -> tensor<1x1x1x1x2x?xf32> { - %0 = tensor.pack %input padding_value(%pad : f32) outer_dims_perm = [1, 0, 2, 3] inner_dims_pos = [3, 2] inner_tiles = [2, %high] into %output : tensor<1x1x5x1xf32> -> tensor<1x1x1x1x2x?xf32> + %0 = linalg.pack %input padding_value(%pad : f32) outer_dims_perm = [1, 0, 2, 3] inner_dims_pos = [3, 2] inner_tiles = [2, %high] into %output : tensor<1x1x5x1xf32> -> tensor<1x1x1x1x2x?xf32> return %0 : tensor<1x1x1x1x2x?xf32> } // CHECK: #[[$ATTR_2:.+]] = affine_map<()[s0] -> (s0 - 5)> @@ -183,7 +183,7 @@ func.func @simple_pad_and_pack_dynamic_tile_not_all_dims_tiled(%input: tensor<1x // ----- func.func @simple_NC_to_CNnc(%arg0: tensor<32x8xf32>, %arg1: tensor<1x1x32x8xf32>) -> tensor<1x1x32x8xf32>{ - %0 = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<32x8xf32> -> tensor<1x1x32x8xf32> + %0 = linalg.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<32x8xf32> -> tensor<1x1x32x8xf32> return %0 : tensor<1x1x32x8xf32> } // CHECK-LABEL: func.func @simple_NC_to_CNnc @@ -197,7 +197,7 @@ func.func @simple_NC_to_CNnc(%arg0: tensor<32x8xf32>, %arg1: tensor<1x1x32x8xf32 // ----- func.func @simple_CHW_to_CHWhwc(%arg0: tensor<3x5x7xf32>, %arg1: tensor<1x1x1x5x7x3xf32>) -> tensor<1x1x1x5x7x3xf32> { - %0 = tensor.pack %arg0 inner_dims_pos = [1, 2, 0] inner_tiles = [5, 7, 3] into %arg1 : tensor<3x5x7xf32> -> tensor<1x1x1x5x7x3xf32> + %0 = linalg.pack %arg0 inner_dims_pos = [1, 2, 0] inner_tiles = [5, 7, 3] into %arg1 : tensor<3x5x7xf32> -> tensor<1x1x1x5x7x3xf32> return %0 : tensor<1x1x1x5x7x3xf32> } // CHECK-LABEL: func.func @simple_CHW_to_CHWhwc @@ -215,7 +215,7 @@ func.func @simple_CHW_to_CHWhwc(%arg0: tensor<3x5x7xf32>, %arg1: tensor<1x1x1x5x // ----- func.func @simple_KCRS_to_KRSCsr(%arg0: tensor<1x1x32x8xf32>, %arg1: tensor<1x1x1x1x8x32xf32>) -> tensor<1x1x1x1x8x32xf32> { - %0 = tensor.pack %arg0 outer_dims_perm = [0, 2, 3, 1] inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x32x8xf32> -> tensor<1x1x1x1x8x32xf32> + %0 = linalg.pack %arg0 outer_dims_perm = [0, 2, 3, 1] inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x32x8xf32> -> tensor<1x1x1x1x8x32xf32> return %0 : tensor<1x1x1x1x8x32xf32> } // CHECK-LABEL: func.func @simple_KCRS_to_KRSCsr diff --git a/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir b/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir index 0dbdf470bbfc9..03437223f0d45 100644 --- a/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir +++ b/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir @@ -4,13 +4,13 @@ // RUN: 
-transform-interpreter %s | FileCheck %s func.func @KCRSsr_to_KCRS(%arg0: tensor<1x1x4x8x8x32xf32>, %arg1: tensor<1x1x128x64xf32>) -> tensor<1x1x128x64xf32> { - %0 = tensor.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x4x8x8x32xf32> -> tensor<1x1x128x64xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x4x8x8x32xf32> -> tensor<1x1x128x64xf32> return %0 : tensor<1x1x128x64xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1, %loops:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 32, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -38,7 +38,7 @@ module attributes {transform.with_named_sequence} { // ----- func.func @unpack_and_extract_slice(%arg0: tensor<2x8x8x2xf32>, %arg1: tensor<13x15xf32>) -> tensor<13x15xf32> { - %0 = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %arg1 : tensor<2x8x8x2xf32> -> tensor<13x15xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %arg1 : tensor<2x8x8x2xf32> -> tensor<13x15xf32> return %0 : tensor<13x15xf32> } // CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (-d0 + 13, 8)> @@ -70,7 +70,7 @@ func.func @unpack_and_extract_slice(%arg0: tensor<2x8x8x2xf32>, %arg1: tensor<13 module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [8, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -79,7 +79,7 @@ module attributes {transform.with_named_sequence} { // ----- func.func @CKkc_to_KC(%arg0: tensor<32x4x32x8xf32>, %arg1: tensor<128x256xf32>) -> tensor<128x256xf32> { - %0 = tensor.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<32x4x32x8xf32> -> tensor<128x256xf32> + %0 = linalg.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<32x4x32x8xf32> -> tensor<128x256xf32> return %0 : tensor<128x256xf32> } // CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 floordiv 32)> @@ -102,7 +102,7 @@ func.func @CKkc_to_KC(%arg0: tensor<32x4x32x8xf32>, %arg1: tensor<128x256xf32>) module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [32, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } diff --git a/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir 
b/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir index ba1f214952562..d460c506d6e18 100644 --- a/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir +++ b/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir @@ -3,7 +3,7 @@ // RUN: -transform-interpreter=entry-point=decompose_unpack %s | FileCheck %s func.func @simple_KCRSsr_to_KCRS(%arg0: tensor<1x1x1x1x8x32xf32>, %arg1: tensor<1x1x32x8xf32>) -> tensor<1x1x32x8xf32> { - %0 = tensor.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x1x1x8x32xf32> -> tensor<1x1x32x8xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x1x1x8x32xf32> -> tensor<1x1x32x8xf32> return %0 : tensor<1x1x32x8xf32> } // CHECK-LABEL: func.func @simple_KCRSsr_to_KCRS @@ -22,7 +22,7 @@ func.func @simple_KCRSsr_to_KCRS(%arg0: tensor<1x1x1x1x8x32xf32>, %arg1: tensor< // ----- func.func @simple_unpack_static_tiles(%input: tensor<1x1x8x2xf32>, %output: tensor<5x1xf32>) -> tensor<5x1xf32> { - %0 = tensor.unpack %input inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor<1x1x8x2xf32> -> tensor<5x1xf32> + %0 = linalg.unpack %input inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor<1x1x8x2xf32> -> tensor<5x1xf32> return %0 : tensor<5x1xf32> } // CHECK-LABEL: func.func @simple_unpack_static_tiles @@ -38,7 +38,7 @@ func.func @simple_unpack_static_tiles(%input: tensor<1x1x8x2xf32>, %output: tens /// Same as example above, but with 1 dynamic tile size. func.func @simple_unpack_dynamic_tile(%input: tensor<1x1x?x2xf32>, %output: tensor<5x1xf32>, %tile_dim: index) -> tensor<5x1xf32> { - %0 = tensor.unpack %input inner_dims_pos = [0, 1] inner_tiles = [%tile_dim, 2] into %output : tensor<1x1x?x2xf32> -> tensor<5x1xf32> + %0 = linalg.unpack %input inner_dims_pos = [0, 1] inner_tiles = [%tile_dim, 2] into %output : tensor<1x1x?x2xf32> -> tensor<5x1xf32> return %0 : tensor<5x1xf32> } // CHECK-LABEL: func.func @simple_unpack_dynamic_tile @@ -55,7 +55,7 @@ func.func @simple_unpack_dynamic_tile(%input: tensor<1x1x?x2xf32>, %output: tens /// Same as example above, but with 1 dynamic tile size and a trasnpose func.func @simple_unpack_dynamic_tile_transpose(%src: tensor<1x1x2x?xf32>, %dest: tensor<5x1xf32>, %tile_dim: index) -> tensor<5x1xf32> { - %0 = tensor.unpack %src inner_dims_pos = [1, 0] inner_tiles = [2, %tile_dim] into %dest : tensor<1x1x2x?xf32> -> tensor<5x1xf32> + %0 = linalg.unpack %src inner_dims_pos = [1, 0] inner_tiles = [2, %tile_dim] into %dest : tensor<1x1x2x?xf32> -> tensor<5x1xf32> return %0 : tensor<5x1xf32> } // CHECK-LABEL: func.func @simple_unpack_dynamic_tile_transpose @@ -78,7 +78,7 @@ func.func @simple_unpack_scalable_tile(%input: tensor<1x1x?x2xf32>, %output: ten %c8 = arith.constant 8 : index %vscale = vector.vscale %c8_vscale = arith.muli %vscale, %c8 : index - %0 = tensor.unpack %input inner_dims_pos = [0, 1] inner_tiles = [%c8_vscale, 2] into %output : tensor<1x1x?x2xf32> -> tensor<5x1xf32> + %0 = linalg.unpack %input inner_dims_pos = [0, 1] inner_tiles = [%c8_vscale, 2] into %output : tensor<1x1x?x2xf32> -> tensor<5x1xf32> return %0 : tensor<5x1xf32> } // CHECK-LABEL: func.func @simple_unpack_scalable_tile @@ -97,7 +97,7 @@ func.func @simple_unpack_scalable_tile(%input: tensor<1x1x?x2xf32>, %output: ten // ----- func.func @simple_CNnc_to_NC(%arg0: tensor<1x1x32x8xf32>, %arg1: tensor<32x8xf32>) -> tensor<32x8xf32>{ - %0 = tensor.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<1x1x32x8xf32> 
-> tensor<32x8xf32> + %0 = linalg.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<1x1x32x8xf32> -> tensor<32x8xf32> return %0 : tensor<32x8xf32> } // CHECK-LABEL: func.func @simple_CNnc_to_NC @@ -112,7 +112,7 @@ func.func @simple_CNnc_to_NC(%arg0: tensor<1x1x32x8xf32>, %arg1: tensor<32x8xf32 // ----- func.func @simple_NCHWc_to_NCHW(%arg0: tensor<2x1x16x8x32xf32>, %arg1: tensor<2x32x16x8xf32>) -> tensor<2x32x16x8xf32> { - %0 = tensor.unpack %arg0 inner_dims_pos = [1] inner_tiles = [32] into %arg1 : tensor<2x1x16x8x32xf32> -> tensor<2x32x16x8xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [1] inner_tiles = [32] into %arg1 : tensor<2x1x16x8x32xf32> -> tensor<2x32x16x8xf32> return %0 : tensor<2x32x16x8xf32> } // CHECK-LABEL: func.func @simple_NCHWc_to_NCHW @@ -131,7 +131,7 @@ func.func @simple_NCHWc_to_NCHW(%arg0: tensor<2x1x16x8x32xf32>, %arg1: tensor<2x // ----- func.func @simple_NHWC_to_NCHW(%arg0: tensor<1x16x8x32xf32>, %arg1: tensor<1x32x16x8xf32>) -> tensor<1x32x16x8xf32> { - %0 = tensor.unpack %arg0 outer_dims_perm = [0, 2, 3, 1] inner_dims_pos = [] inner_tiles = [] into %arg1 : tensor<1x16x8x32xf32> -> tensor<1x32x16x8xf32> + %0 = linalg.unpack %arg0 outer_dims_perm = [0, 2, 3, 1] inner_dims_pos = [] inner_tiles = [] into %arg1 : tensor<1x16x8x32xf32> -> tensor<1x32x16x8xf32> return %0 : tensor<1x32x16x8xf32> } // CHECK-LABEL: func.func @simple_NHWC_to_NCHW @@ -150,7 +150,7 @@ func.func @simple_NHWC_to_NCHW(%arg0: tensor<1x16x8x32xf32>, %arg1: tensor<1x32x // ----- func.func @unpack_with_dynamic_dims(%arg0: tensor, %arg1: tensor) -> tensor { - %0 = tensor.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor -> tensor + %0 = linalg.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor -> tensor return %0 : tensor } // CHECK-LABEL: func.func @unpack_with_dynamic_dims diff --git a/mlir/test/Dialect/Linalg/fold-empty-op.mlir b/mlir/test/Dialect/Linalg/fold-empty-op.mlir new file mode 100644 index 0000000000000..5ce19d7091318 --- /dev/null +++ b/mlir/test/Dialect/Linalg/fold-empty-op.mlir @@ -0,0 +1,82 @@ +// RUN: mlir-opt -split-input-file -transform-interpreter %s | FileCheck %s + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) { + %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func"> + transform.apply_patterns to %func_op { + transform.apply_patterns.linalg.fold_pack_unpack_into_empty + } : !transform.op<"func.func"> + transform.yield + } +} + +func.func @pack_empty(%arg0: tensor<8x8x32x32xf32>) -> tensor<8x8x32x32xf32> { + %empty_unpacked = tensor.empty() : tensor<256x256xf32> + %packed = linalg.pack %empty_unpacked + inner_dims_pos = [0, 1] inner_tiles = [32, 32] + into %arg0 : tensor<256x256xf32> -> tensor<8x8x32x32xf32> + return %packed : tensor<8x8x32x32xf32> +} + +// CHECK-LABEL: func.func @pack_empty( +// CHECK-SAME: %[[T:.+]]: tensor<8x8x32x32xf32> +// CHECK-NOT: linalg.pack +// CHECK: return %[[T]] : tensor<8x8x32x32xf32> + +func.func @pack_empty_dynamic(%arg0: tensor, %dim0: index, %dim1: index) -> tensor { + %empty_unpacked = tensor.empty(%dim0, %dim1) : tensor + %packed = linalg.pack %empty_unpacked + inner_dims_pos = [0, 1] inner_tiles = [32, 32] + into %arg0 : tensor -> tensor + return %packed : tensor +} + +// CHECK-LABEL: func.func @pack_empty_dynamic( +// CHECK-SAME: %[[T:.+]]: tensor, +// CHECK-SAME: 
%[[DIM0:[a-zA-Z0-9_]+]]: index, +// CHECK-SAME: %[[DIM1:[a-zA-Z0-9_]+]]: index +// CHECK-NOT: linalg.pack +// CHECK: return %[[T]] : tensor + +func.func @unpack_empty(%arg0: tensor<256x256xf32>) -> tensor<256x256xf32> { + %empty_packed = tensor.empty() : tensor<8x8x32x32xf32> + %unpacked = linalg.unpack %empty_packed + inner_dims_pos = [0, 1] inner_tiles = [32, 32] + into %arg0 : tensor<8x8x32x32xf32> -> tensor<256x256xf32> + return %unpacked : tensor<256x256xf32> +} + +// CHECK-LABEL: func.func @unpack_empty( +// CHECK-SAME: %[[T:.+]]: tensor<256x256xf32> +// CHECK-NOT: linalg.unpack +// CHECK: return %[[T]] : tensor<256x256xf32> + +func.func @unpack_empty_dynamic(%arg0: tensor, %dim0: index, %dim1: index) -> tensor { + %empty_packed = tensor.empty(%dim0, %dim1) : tensor + %unpacked = linalg.unpack %empty_packed + inner_dims_pos = [0, 1] inner_tiles = [32, 32] + into %arg0 : tensor -> tensor + return %unpacked : tensor +} + +// CHECK-LABEL: func.func @unpack_empty_dynamic( +// CHECK-SAME: %[[T:.+]]: tensor, +// CHECK-SAME: %[[DIM0:[a-zA-Z0-9_]+]]: index, +// CHECK-SAME: %[[DIM1:[a-zA-Z0-9_]+]]: index +// CHECK-NOT: linalg.unpack +// CHECK: return %[[T]] : tensor + +func.func @pack_padded_empty(%arg0: tensor<8x8x32x32xf32>) -> tensor<8x8x32x32xf32> { + %pad = arith.constant 1.0 : f32 + %empty_unpacked = tensor.empty() : tensor<256x256xf32> + %packed = linalg.pack %empty_unpacked + padding_value(%pad : f32) + inner_dims_pos = [0, 1] inner_tiles = [32, 32] + into %arg0 : tensor<256x256xf32> -> tensor<8x8x32x32xf32> + return %packed : tensor<8x8x32x32xf32> +} + +// CHECK-LABEL: func.func @pack_padded_empty( +// CHECK-SAME: %[[T:.+]]: tensor<8x8x32x32xf32> +// CHECK: %[[PACK:.+]] = linalg.pack +// CHECK: return %[[PACK]] : tensor<8x8x32x32xf32> diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir index cff741e75077e..f2283db8f89b2 100644 --- a/mlir/test/Dialect/Linalg/invalid.mlir +++ b/mlir/test/Dialect/Linalg/invalid.mlir @@ -1284,6 +1284,7 @@ func.func @indexing_map_size_one_batch_matmul(%arg0: memref, ins(%arg0, %arg1 : memref, memref) outs(%arg2: memref) return + } // ----- @@ -1459,3 +1460,187 @@ func.func @invalid_C_map_result_dim_batch_matmul(%arg0: memref, %arg1 outs(%arg2: memref) return } + + +// ----- + +//===----------------------------------------------------------------------===// +// linalg.pack +//===----------------------------------------------------------------------===// + +func.func @pack_invalid_no_padding_no_full_tiles(%input: tensor<256x128xf32>, %output: tensor<8x8x16x33xf32>) -> tensor<8x8x16x33xf32> { + // expected-error@+1 {{invalid tile factor or output size provided. Only full tiles are supported when padding_value is not set}} + %0 = linalg.pack %input inner_dims_pos = [1, 0] inner_tiles = [16, 33] into %output : tensor<256x128xf32> -> tensor<8x8x16x33xf32> + return %0 : tensor<8x8x16x33xf32> +} + +// ----- + +func.func @pack_invalid_no_padding_no_full_tiles_dyn_tiles(%input: tensor<256x128xf32>, %output: tensor<10x8x?x?xf32>, %tile_size_0: index, %tile_size_1: index) -> tensor<10x8x?x?xf32> { + // expected-error@+1 {{invalid tile factor or output size provided. 
Only full tiles are supported when padding_value is not set}}
+  %0 = linalg.pack %input inner_dims_pos = [1, 0] inner_tiles = [%tile_size_0, %tile_size_1] into %output : tensor<256x128xf32> -> tensor<10x8x?x?xf32>
+  return %0 : tensor<10x8x?x?xf32>
+}
+
+// -----
+
+func.func @pack_invalid_no_padding_no_full_tiles_dyn_tiles_outperm(%input: tensor<256x128xf32>, %output: tensor<8x10x?x?xf32>, %tile_size_0: index, %tile_size_1: index) -> tensor<8x10x?x?xf32> {
+  // expected-error@+1 {{invalid tile factor or output size provided. Only full tiles are supported when padding_value is not set}}
+  %0 = linalg.pack %input outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [%tile_size_0, %tile_size_1] into %output : tensor<256x128xf32> -> tensor<8x10x?x?xf32>
+  return %0 : tensor<8x10x?x?xf32>
+}
+
+// -----
+
+func.func @pad_and_pack_invalid_type(%input: tensor<13x15xf32>, %output: tensor<2x8x8x2xf32>, %pad: i32) -> tensor<2x8x8x2xf32> {
+  // expected-error@+1 {{expected padding_value has 'f32' but got: 'i32'}}
+  %0 = linalg.pack %input padding_value(%pad: i32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor<13x15xf32> -> tensor<2x8x8x2xf32>
+  return %0 : tensor<2x8x8x2xf32>
+}
+
+// -----
+
+func.func @pack_invalid_inner_dims_pos_vector(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> {
+  // expected-error@+1 {{invalid inner_dims_pos vector}}
+  %0 = linalg.pack %input inner_dims_pos = [2, 0] inner_tiles = [2, 2] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32>
+  return %0 : tensor<8x8x32x16xf32>
+}
+
+// -----
+
+func.func @pack_invalid_duplicate_element_in_inner_dims(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> {
+  // expected-error@+1 {{invalid inner_dims_pos vector}}
+  %0 = linalg.pack %input inner_dims_pos = [1, 1] inner_tiles = [2, 2] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32>
+  return %0 : tensor<8x8x32x16xf32>
+}
+
+// -----
+
+func.func @pack_invalid_duplicate_element_in_outer_perm(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> {
+  // expected-error@+1 {{invalid outer_dims_perm vector}}
+  %0 = linalg.pack %input outer_dims_perm = [1, 1] inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32>
+  return %0 : tensor<8x8x32x16xf32>
+}
+
+// -----
+
+func.func @pack_invalid_output_rank(%input: tensor<256x128xf32>, %output: tensor<64x32x16xf32>) -> tensor<64x32x16xf32> {
+  // expected-error@+1 {{packed rank != (unpacked rank + num tiling factors), got 3 != 4}}
+  %0 = linalg.pack %input inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %output : tensor<256x128xf32> -> tensor<64x32x16xf32>
+  return %0 : tensor<64x32x16xf32>
+}
+
+// -----
+
+func.func @pack_invalid(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> {
+  // expected-error@+1 {{invalid zero tile factor}}
+  %0 = linalg.pack %input inner_dims_pos = [1, 0] inner_tiles = [0, 2] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32>
+  return %0 : tensor<8x8x32x16xf32>
+}
+
+// -----
+func.func @pack_mismatch_inner_tile_size_and_output_shape(
+    %input : tensor, %output : tensor) -> tensor {
+  // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}}
+  %0 = linalg.pack %input inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %output : tensor -> tensor
+  return %0 : tensor
+}
+
+// -----
+
+func.func
@pack_dynamic_inner_tile_size_and_static_output_shape( + %input : tensor, %output : tensor) -> tensor { + %c8 = arith.constant 8 : index + // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} + %0 = linalg.pack %input inner_dims_pos = [0, 1] inner_tiles = [8, %c8] into %output : tensor -> tensor + return %0 : tensor +} + +// ----- + +func.func @pack_static_inner_tile_size_and_dynamic_output_shape( + %input : tensor, %output : tensor) -> tensor { + // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} + %0 = linalg.pack %input inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %output : tensor -> tensor + return %0 : tensor +} + +// ----- + +func.func @pack_invalid_outer_dims_perm(%source: tensor<128x256xf32>, %dest: tensor<16x4x32x16xf32>) -> tensor<16x4x32x16xf32> { + // expected-error@+1 {{outer_dims_perm must be a permutation or empty}} + %0 = linalg.pack %source outer_dims_perm = [0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<16x4x32x16xf32> + return %0 : tensor<16x4x32x16xf32> +} + +// ----- + +//===----------------------------------------------------------------------===// +// linalg.unpack +//===----------------------------------------------------------------------===// + +func.func @unpack_invalid_output_rank(%input: tensor<256x128xf32>, %output: tensor<64x32x16xf32>) -> tensor<256x128xf32> { + // expected-error@+1 {{packed rank != (unpacked rank + num tiling factors), got 3 != 4}} + %0 = linalg.unpack %output inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %input : tensor<64x32x16xf32> -> tensor<256x128xf32> + return %0 : tensor<256x128xf32> +} + +// ----- + +func.func @unpack_invalid_out_of_bound_outer_perm(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { + // expected-error@+1 {{invalid outer_dims_perm vector}} + %0 = linalg.unpack %output outer_dims_perm = [2, 1] inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %input : tensor<8x8x32x16xf32> -> tensor<256x128xf32> + return %0 : tensor<256x128xf32> +} + +// ----- + +func.func @unpack_invalid_outer_dims_perm(%source: tensor<128x256xf32>, %dest: tensor<16x4x32x16xf32>) -> tensor<128x256xf32> { + // expected-error@+1 {{outer_dims_perm must be a permutation or empty}} + %0 = linalg.unpack %dest outer_dims_perm = [1] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %source : tensor<16x4x32x16xf32> -> tensor<128x256xf32> + return %0 : tensor<128x256xf32> +} + +// ----- + +func.func @pack_invalid(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { + // expected-error@+1 {{the shape of output is not large enough to hold the packed data. Expected at least 'tensor<8x8x16x32xf32>', got 'tensor<8x8x32x16xf32>'}} + %0 = linalg.pack %input inner_dims_pos = [1, 0] inner_tiles = [16, 32] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32> + return %0 : tensor<8x8x32x16xf32> +} + +// ----- + +func.func @unpack_invalid(%output: tensor<256x128xf32>, %input: tensor<8x8x32x16xf32>) -> tensor<256x128xf32> { + // expected-error@+1 {{the shape of output is not large enough to hold the packed data. 
Expected at least 'tensor<8x32x4x32xf32>', got 'tensor<8x8x32x16xf32>'}} + %0 = linalg.unpack %input inner_dims_pos = [1, 0] inner_tiles = [4, 32] into %output : tensor<8x8x32x16xf32> -> tensor<256x128xf32> + return %0 : tensor<256x128xf32> +} + +// ----- + +func.func @unpack_mismatch_inner_tile_size_and_output_shape( + %input : tensor, %output : tensor) -> tensor { + // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} + %0 = linalg.unpack %input inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %output : tensor -> tensor + return %0 : tensor +} + +// ----- + +func.func @unpack_dynamic_inner_tile_size_and_static_output_shape( + %input : tensor, %output : tensor) -> tensor { + %c8 = arith.constant 8 : index + // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} + %0 = linalg.unpack %input inner_dims_pos = [0, 1] inner_tiles = [%c8, 4] into %output : tensor -> tensor + return %0 : tensor +} + +// ----- + +func.func @unpack_static_inner_tile_size_and_dynamic_output_shape( + %input : tensor, %output : tensor) -> tensor { + // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} + %0 = linalg.unpack %input inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %output : tensor -> tensor + return %0 : tensor +} diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir index 68ea97be911a6..8474eeac0db5b 100644 --- a/mlir/test/Dialect/Linalg/named-ops.mlir +++ b/mlir/test/Dialect/Linalg/named-ops.mlir @@ -2520,3 +2520,108 @@ func.func @select_tensor(%arg0: tensor<4x8x16xi1>, %arg1: tensor<4x8x16xf32>, %a %1 = linalg.select ins(%arg0, %arg1, %arg2 : tensor<4x8x16xi1>, tensor<4x8x16xf32>, tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> return %1 : tensor<4x8x16xf32> } + +//===----------------------------------------------------------------------===// +// linalg.pack + linalg.unpack +//===----------------------------------------------------------------------===// + +func.func @pack_nc_to_ncnc(%source: tensor<128x256xf32>, %dest: tensor<4x16x32x16xf32>) -> tensor<128x256xf32> { + %0 = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> + %1 = tensor.empty() : tensor<128x256xf32> + %2 = linalg.unpack %0 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %1 : tensor<4x16x32x16xf32> -> tensor<128x256xf32> + return %2 : tensor<128x256xf32> +} + +// CHECK-LABEL: func.func @pack_nc_to_ncnc( +// CHECK-SAME: %[[SOURCE:.*]]: tensor<128x256xf32>, +// CHECK-SAME: %[[DEST:.*]]: tensor<4x16x32x16xf32>) +// CHECK: %[[PACKED:.*]] = linalg.pack %[[SOURCE]] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %[[DEST]] : tensor<128x256xf32> -> tensor<4x16x32x16xf32> +// CHECK: %[[BUFF:.*]] = tensor.empty() : tensor<128x256xf32> +// CHECK: %{{.*}} = linalg.unpack %[[PACKED]] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %[[BUFF]] : tensor<4x16x32x16xf32> -> tensor<128x256xf32> + +// ----- + +func.func @pack_nc_to_ncnc_with_padding(%source: tensor<13x15xf32>, %dest: tensor<2x8x8x2xf32>, %padding: f32) -> tensor<13x15xf32> { + %0 = linalg.pack %source padding_value(%padding : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest : tensor<13x15xf32> -> tensor<2x8x8x2xf32> + %1 = tensor.empty() : tensor<13x15xf32> + %2 = linalg.unpack %0 inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %1 : 
tensor<2x8x8x2xf32> -> tensor<13x15xf32> + return %2 : tensor<13x15xf32> +} + +// CHECK-LABEL: func.func @pack_nc_to_ncnc_with_padding( +// CHECK-SAME: %[[SOURCE:.*]]: tensor<13x15xf32>, +// CHECK-SAME: %[[DEST:.*]]: tensor<2x8x8x2xf32>, +// CHECK-SAME: %[[PADDING:.*]]: f32) +// CHECK: %[[PACKED:.*]] = linalg.pack %[[SOURCE]] padding_value(%[[PADDING]] : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %[[DEST]] : tensor<13x15xf32> -> tensor<2x8x8x2xf32> +// CHECK: %[[BUFF:.*]] = tensor.empty() : tensor<13x15xf32> +// CHECK: %{{.*}} = linalg.unpack %[[PACKED]] inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %[[BUFF]] : tensor<2x8x8x2xf32> -> tensor<13x15xf32> + +// ----- + +func.func @pack_ck_to_kcck(%source: tensor<128x256xf32>, %dest: tensor<16x4x32x16xf32>) -> tensor<128x256xf32> { + %0 = linalg.pack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<16x4x32x16xf32> + %1 = tensor.empty() : tensor<128x256xf32> + %2 = linalg.unpack %0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %1 : tensor<16x4x32x16xf32> -> tensor<128x256xf32> + return %2 : tensor<128x256xf32> +} + +// CHECK-LABEL: func.func @pack_ck_to_kcck( +// CHECK-SAME: %[[SOURCE:.*]]: tensor<128x256xf32>, +// CHECK-SAME: %[[DEST:.*]]: tensor<16x4x32x16xf32>) +// CHECK: %[[PACKED:.*]] = linalg.pack %[[SOURCE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %[[DEST]] : tensor<128x256xf32> -> tensor<16x4x32x16xf32> +// CHECK: %[[BUFF:.*]] = tensor.empty() : tensor<128x256xf32> +// CHECK: %{{.*}} = linalg.unpack %[[PACKED]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %[[BUFF]] : tensor<16x4x32x16xf32> -> tensor<128x256xf32> + +// ----- + +func.func @pad_and_pack_fully_dynamic(%source: tensor, %dest: tensor, %pad: f32, %tile_n : index, %tile_m : index) -> tensor { + %0 = linalg.pack %source padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func.func @pad_and_pack_fully_dynamic( +// CHECK-SAME: %[[SOURCE:.*]]: tensor, +// CHECK-SAME: %[[DEST:.*]]: tensor, +// CHECK-SAME: %[[PAD:.*]]: f32, +// CHECK-SAME: %[[TILE_N:.*]]: index, +// CHECK-SAME: %[[TILE_M:.*]]: index) +// CHECK: %{{.*}} = linalg.pack %[[SOURCE]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [%[[TILE_N]], %[[TILE_M]]] into %[[DEST]] : tensor -> tensor + +// ----- + +func.func @pad_and_pack_partially_dynamic(%source: tensor, %dest: tensor, %pad: f32) -> tensor { + %0 = linalg.pack %source padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest : tensor -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func.func @pad_and_pack_partially_dynamic( +// CHECK-SAME: %[[SOURCE:.*]]: tensor, +// CHECK-SAME: %[[DEST:.*]]: tensor, +// CHECK-SAME: %[[PAD:.*]]: f32) +// CHECK: %{{.*}} = linalg.pack %[[SOURCE]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %[[DEST]] : tensor -> tensor + +// ----- + +func.func @unpack_fully_dynamic(%source: tensor, %dest: tensor, %tile_n : index, %tile_m : index) -> tensor { + %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func.func @unpack_fully_dynamic( +// CHECK-SAME: %[[SOURCE:.*]]: tensor, +// CHECK-SAME: %[[DEST:.*]]: tensor, +// CHECK-SAME: %[[TILE_N:.*]]: index, +// CHECK-SAME: 
%[[TILE_M:.*]]: index) +// CHECK: %{{.*}} = linalg.unpack %[[SOURCE]] inner_dims_pos = [0, 1] inner_tiles = [%[[TILE_N]], %[[TILE_M]]] into %[[DEST]] : tensor -> tensor + +// ----- + +func.func @unpack_partially_dynamic(%source: tensor, %dest: tensor) -> tensor { + %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest : tensor -> tensor + return %0: tensor +} + +// CHECK-LABEL: func.func @unpack_partially_dynamic( +// CHECK-SAME: %[[SOURCE:.*]]: tensor, +// CHECK-SAME: %[[DEST:.*]]: tensor) +// CHECK: %{{.*}} = linalg.unpack %[[SOURCE]] inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %[[DEST]] : tensor -> tensor diff --git a/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir b/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir similarity index 86% rename from mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir rename to mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir index f9e51ae52a74b..51350e5bc8498 100644 --- a/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir +++ b/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -split-input-file -test-tensor-transform-patterns="test-simplify-pack-unpack-patterns" %s | FileCheck %s +// RUN: mlir-opt -split-input-file -test-linalg-transform-patterns="test-simplify-pack-unpack-patterns" %s | FileCheck %s // CHECK-LABEL: func.func @single_dim_packing( // CHECK-SAME: %[[ARG0:.+]]: tensor<256xf32>) @@ -6,7 +6,7 @@ // CHECK: return %[[EXPANDED]] : tensor<8x32xf32> func.func @single_dim_packing(%arg0: tensor<256xf32>) -> tensor<8x32xf32> { %empty = tensor.empty() : tensor<8x32xf32> - %0 = tensor.pack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<256xf32> -> tensor<8x32xf32> + %0 = linalg.pack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<256xf32> -> tensor<8x32xf32> return %0 : tensor<8x32xf32> } @@ -15,11 +15,11 @@ func.func @single_dim_packing(%arg0: tensor<256xf32>) -> tensor<8x32xf32> { // CHECK-LABEL: func.func @single_dim_packing_with_padding( // CHECK-SAME: %[[ARG0:.+]]: tensor<255xf32>) // CHECK-NOT: tensor.expand_shape -// CHECK: tensor.pack +// CHECK: linalg.pack func.func @single_dim_packing_with_padding(%arg0: tensor<255xf32>) -> tensor<8x32xf32> { %empty = tensor.empty() : tensor<8x32xf32> %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.pack %arg0 padding_value(%cst : f32) inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<255xf32> -> tensor<8x32xf32> + %0 = linalg.pack %arg0 padding_value(%cst : f32) inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<255xf32> -> tensor<8x32xf32> return %0 : tensor<8x32xf32> } @@ -31,7 +31,7 @@ func.func @single_dim_packing_with_padding(%arg0: tensor<255xf32>) -> tensor<8x3 // CHECK: return %[[EXPANDED]] : tensor<5x8x32xf32> func.func @single_last_inner_dim_packing(%arg0: tensor<5x256xf32>) -> tensor<5x8x32xf32> { %empty = tensor.empty() : tensor<5x8x32xf32> - %0 = tensor.pack %arg0 inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x256xf32> -> tensor<5x8x32xf32> + %0 = linalg.pack %arg0 inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x256xf32> -> tensor<5x8x32xf32> return %0 : tensor<5x8x32xf32> } @@ -43,7 +43,7 @@ func.func @single_last_inner_dim_packing(%arg0: tensor<5x256xf32>) -> tensor<5x8 // CHECK: return %[[EXPANDED]] : tensor<2x32xf32> func.func @pack_1d_with_outer_dims_perm(%arg0: tensor<64xf32>) -> tensor<2x32xf32> { %empty = tensor.empty() : tensor<2x32xf32> - %pack = tensor.pack %arg0 outer_dims_perm = [0] inner_dims_pos = [0] 
inner_tiles = [32] into %empty : tensor<64xf32> -> tensor<2x32xf32> + %pack = linalg.pack %arg0 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<64xf32> -> tensor<2x32xf32> return %pack : tensor<2x32xf32> } @@ -55,7 +55,7 @@ func.func @pack_1d_with_outer_dims_perm(%arg0: tensor<64xf32>) -> tensor<2x32xf3 // CHECK: return %[[EXPANDED]] : tensor<5x8x32xf32> func.func @single_last_inner_dim_packing_with_identity_outer_dims_perm(%arg0: tensor<5x256xf32>) -> tensor<5x8x32xf32> { %empty = tensor.empty() : tensor<5x8x32xf32> - %0 = tensor.pack %arg0 outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x256xf32> -> tensor<5x8x32xf32> + %0 = linalg.pack %arg0 outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x256xf32> -> tensor<5x8x32xf32> return %0 : tensor<5x8x32xf32> } @@ -63,10 +63,10 @@ func.func @single_last_inner_dim_packing_with_identity_outer_dims_perm(%arg0: te // CHECK-LABEL: func.func @packing_with_outer_dims_perm( // CHECK-NOT: tensor.expand_shape -// CHECK: tensor.pack +// CHECK: linalg.pack func.func @packing_with_outer_dims_perm(%arg0: tensor<5x256xf32>) -> tensor<8x5x32xf32> { %empty = tensor.empty() : tensor<8x5x32xf32> - %0 = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x256xf32> -> tensor<8x5x32xf32> + %0 = linalg.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x256xf32> -> tensor<8x5x32xf32> return %0 : tensor<8x5x32xf32> } @@ -74,10 +74,10 @@ func.func @packing_with_outer_dims_perm(%arg0: tensor<5x256xf32>) -> tensor<8x5x // CHECK-LABEL: func.func @single_first_inner_dim_packing( // CHECK-NOT: tensor.expand_shape -// CHECK: tensor.pack +// CHECK: linalg.pack func.func @single_first_inner_dim_packing(%arg0: tensor<256x5xf32>) -> tensor<8x5x32xf32> { %empty = tensor.empty() : tensor<8x5x32xf32> - %0 = tensor.pack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<256x5xf32> -> tensor<8x5x32xf32> + %0 = linalg.pack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<256x5xf32> -> tensor<8x5x32xf32> return %0 : tensor<8x5x32xf32> } @@ -89,7 +89,7 @@ func.func @single_first_inner_dim_packing(%arg0: tensor<256x5xf32>) -> tensor<8x // CHECK: return %[[EXPANDED]] func.func @pack_1x32_to_1x32x1x1(%arg0 : tensor<1x32xf32>) -> tensor<1x32x1x1xf32> { %empty = tensor.empty() : tensor<1x32x1x1xf32> - %pack = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [1, 1] into %empty + %pack = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [1, 1] into %empty : tensor<1x32xf32> -> tensor<1x32x1x1xf32> return %pack : tensor<1x32x1x1xf32> } @@ -102,7 +102,7 @@ func.func @pack_1x32_to_1x32x1x1(%arg0 : tensor<1x32xf32>) -> tensor<1x32x1x1xf3 // CHECK: return %[[EXPANDED]] func.func @pack_1x32_to_1x16x1x2(%arg0 : tensor<1x32xf32>) -> tensor<1x16x1x2xf32> { %empty = tensor.empty() : tensor<1x16x1x2xf32> - %pack = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [1, 2] into %empty + %pack = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [1, 2] into %empty : tensor<1x32xf32> -> tensor<1x16x1x2xf32> return %pack : tensor<1x16x1x2xf32> } @@ -115,7 +115,7 @@ func.func @pack_1x32_to_1x16x1x2(%arg0 : tensor<1x32xf32>) -> tensor<1x16x1x2xf3 // CHECK: return %[[EXPANDED]] func.func @pack_32x1_to_16x1x2x1(%arg0 : tensor<32x1xf32>) -> tensor<1x16x2x1xf32> { %empty = tensor.empty() : tensor<1x16x2x1xf32> - %pack = tensor.pack %arg0 
outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 1] into %empty + %pack = linalg.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 1] into %empty : tensor<32x1xf32> -> tensor<1x16x2x1xf32> return %pack : tensor<1x16x2x1xf32> } @@ -124,10 +124,10 @@ func.func @pack_32x1_to_16x1x2x1(%arg0 : tensor<32x1xf32>) -> tensor<1x16x2x1xf3 // CHECK-LABEL: func.func @pack_32x1_to_16x1x1x2 // CHECK-NOT: tensor.expand_shape -// CHECK: tensor.pack +// CHECK: linalg.pack func.func @pack_32x1_to_16x1x1x2(%arg0 : tensor<32x1xf32>) -> tensor<16x1x1x2xf32> { %empty = tensor.empty() : tensor<16x1x1x2xf32> - %pack = tensor.pack %arg0 inner_dims_pos = [1, 0] inner_tiles = [1, 2] into %empty + %pack = linalg.pack %arg0 inner_dims_pos = [1, 0] inner_tiles = [1, 2] into %empty : tensor<32x1xf32> -> tensor<16x1x1x2xf32> return %pack : tensor<16x1x1x2xf32> } @@ -140,7 +140,7 @@ func.func @pack_32x1_to_16x1x1x2(%arg0 : tensor<32x1xf32>) -> tensor<16x1x1x2xf3 // CHECK: return %[[COLLAPSED]] func.func @unpack_1d_to_collapse(%arg0: tensor<8x32xf32>) -> tensor<256xf32> { %empty = tensor.empty() : tensor<256xf32> - %0 = tensor.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<8x32xf32> -> tensor<256xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<8x32xf32> -> tensor<256xf32> return %0 : tensor<256xf32> } @@ -148,10 +148,10 @@ func.func @unpack_1d_to_collapse(%arg0: tensor<8x32xf32>) -> tensor<256xf32> { // CHECK-LABEL: func.func @unpack_to_partial_slice // CHECK-NOT: tensor.collapse -// CHECK: tensor.unpack +// CHECK: linalg.unpack func.func @unpack_to_partial_slice(%arg0: tensor<8x32xf32>) -> tensor<255xf32> { %empty = tensor.empty() : tensor<255xf32> - %0 = tensor.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<8x32xf32> -> tensor<255xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<8x32xf32> -> tensor<255xf32> return %0 : tensor<255xf32> } @@ -159,14 +159,14 @@ func.func @unpack_to_partial_slice(%arg0: tensor<8x32xf32>) -> tensor<255xf32> { // CHECK-LABEL: func.func @unpack_dynamic // CHECK-NOT: tensor.collapse -// CHECK: tensor.unpack +// CHECK: linalg.unpack func.func @unpack_dynamic(%arg0: tensor) -> tensor { %c32 = arith.constant 32 : index %c0 = arith.constant 0 : index %d0 = tensor.dim %arg0, %c0 : tensor %size = arith.muli %d0, %c32 : index %empty = tensor.empty(%size) : tensor - %0 = tensor.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor -> tensor + %0 = linalg.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor -> tensor return %0 : tensor } @@ -178,7 +178,7 @@ func.func @unpack_dynamic(%arg0: tensor) -> tensor { // CHECK: return %[[COLLAPSED]] : tensor<5x256xf32> func.func @single_last_inner_dim_unpacking(%arg0: tensor<5x8x32xf32>) -> tensor<5x256xf32> { %empty = tensor.empty() : tensor<5x256xf32> - %0 = tensor.unpack %arg0 inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x8x32xf32> -> tensor<5x256xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x8x32xf32> -> tensor<5x256xf32> return %0 : tensor<5x256xf32> } @@ -190,7 +190,7 @@ func.func @single_last_inner_dim_unpacking(%arg0: tensor<5x8x32xf32>) -> tensor< // CHECK: return %[[COLLAPSED]] : tensor<5x256xf32> func.func @single_last_inner_dim_unpacking_with_identity_outer_dims_perm(%arg0: tensor<5x8x32xf32>) -> tensor<5x256xf32> { %empty = tensor.empty() : tensor<5x256xf32> - 
%0 = tensor.unpack %arg0 outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x8x32xf32> -> tensor<5x256xf32> + %0 = linalg.unpack %arg0 outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x8x32xf32> -> tensor<5x256xf32> return %0 : tensor<5x256xf32> } @@ -198,10 +198,10 @@ func.func @single_last_inner_dim_unpacking_with_identity_outer_dims_perm(%arg0: // CHECK-LABEL: func.func @unpacking_with_outer_dims_perm( // CHECK-NOT: tensor.collpase_shape -// CHECK: tensor.unpack +// CHECK: linalg.unpack func.func @unpacking_with_outer_dims_perm(%arg0: tensor<8x5x32xf32>) -> tensor<5x256xf32> { %empty = tensor.empty() : tensor<5x256xf32> - %0 = tensor.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<8x5x32xf32> -> tensor<5x256xf32> + %0 = linalg.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<8x5x32xf32> -> tensor<5x256xf32> return %0 : tensor<5x256xf32> } @@ -209,10 +209,10 @@ func.func @unpacking_with_outer_dims_perm(%arg0: tensor<8x5x32xf32>) -> tensor<5 // CHECK-LABEL: func.func @single_first_inner_dim_unpacking( // CHECK-NOT: tensor.collapse_shape -// CHECK: tensor.unpack +// CHECK: linalg.unpack func.func @single_first_inner_dim_unpacking(%arg0: tensor<8x5x32xf32>) -> tensor<256x5xf32> { %empty = tensor.empty() : tensor<256x5xf32> - %0 = tensor.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<8x5x32xf32> -> tensor<256x5xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<8x5x32xf32> -> tensor<256x5xf32> return %0 : tensor<256x5xf32> } @@ -224,7 +224,7 @@ func.func @single_first_inner_dim_unpacking(%arg0: tensor<8x5x32xf32>) -> tensor // CHECK: return %[[COLLAPSED]] func.func @unpack_1x32x1x1_to_1x32(%arg0 : tensor<1x32x1x1xf32>) -> tensor<1x32xf32> { %empty = tensor.empty() : tensor<1x32xf32> - %unpack = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [1, 1] into %empty + %unpack = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [1, 1] into %empty : tensor<1x32x1x1xf32> -> tensor<1x32xf32> return %unpack : tensor<1x32xf32> } @@ -237,7 +237,7 @@ func.func @unpack_1x32x1x1_to_1x32(%arg0 : tensor<1x32x1x1xf32>) -> tensor<1x32x // CHECK: return %[[COLLAPSED]] func.func @unpack_1x2x1x16_to_1x32(%arg0 : tensor<1x2x1x16xf32>) -> tensor<1x32xf32> { %empty = tensor.empty() : tensor<1x32xf32> - %unpack = tensor.unpack %arg0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [1, 16] into %empty + %unpack = linalg.unpack %arg0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [1, 16] into %empty : tensor<1x2x1x16xf32> -> tensor<1x32xf32> return %unpack : tensor<1x32xf32> } @@ -250,7 +250,7 @@ func.func @unpack_1x2x1x16_to_1x32(%arg0 : tensor<1x2x1x16xf32>) -> tensor<1x32x // CHECK: return %[[COLLAPSED]] func.func @unpack_16x1x2x1_to_32x1(%arg0 : tensor<1x16x2x1xf32>) -> tensor<32x1xf32> { %empty = tensor.empty() : tensor<32x1xf32> - %unpack = tensor.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 1] into %empty + %unpack = linalg.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 1] into %empty : tensor<1x16x2x1xf32> -> tensor<32x1xf32> return %unpack : tensor<32x1xf32> } @@ -259,10 +259,10 @@ func.func @unpack_16x1x2x1_to_32x1(%arg0 : tensor<1x16x2x1xf32>) -> tensor<32x1x // CHECK-LABEL: func.func @unpack_16x1x1x2_to_32x1 // CHECK-NOT: tensor.collapse_shape -// CHECK: 
tensor.unpack +// CHECK: linalg.unpack func.func @unpack_16x1x1x2_to_32x1(%arg0 : tensor<16x1x1x2xf32>) -> tensor<32x1xf32> { %empty = tensor.empty() : tensor<32x1xf32> - %unpack = tensor.unpack %arg0 inner_dims_pos = [1, 0] inner_tiles = [1, 2] into %empty + %unpack = linalg.unpack %arg0 inner_dims_pos = [1, 0] inner_tiles = [1, 2] into %empty : tensor<16x1x1x2xf32> -> tensor<32x1xf32> return %unpack : tensor<32x1xf32> } @@ -275,7 +275,7 @@ func.func @unpack_16x1x1x2_to_32x1(%arg0 : tensor<16x1x1x2xf32>) -> tensor<32x1x // CHECK: return %[[EXPANDED]] : tensor<1x1x32x64xf32> func.func @pad_like_pack(%arg0: tensor<32x64xf32>) -> tensor<1x1x32x64xf32> { %empty = tensor.empty() : tensor<1x1x32x64xf32> - %0 = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<32x64xf32> -> tensor<1x1x32x64xf32> + %0 = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<32x64xf32> -> tensor<1x1x32x64xf32> return %0 : tensor<1x1x32x64xf32> } @@ -287,7 +287,7 @@ func.func @pad_like_pack(%arg0: tensor<32x64xf32>) -> tensor<1x1x32x64xf32> { // CHECK: return %[[EXPANDED]] : tensor<1x1x32x64xf32> func.func @pad_like_pack_with_outer_dims_perm(%arg0: tensor<32x64xf32>) -> tensor<1x1x32x64xf32> { %empty = tensor.empty() : tensor<1x1x32x64xf32> - %0 = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<32x64xf32> -> tensor<1x1x32x64xf32> + %0 = linalg.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<32x64xf32> -> tensor<1x1x32x64xf32> return %0 : tensor<1x1x32x64xf32> } @@ -299,7 +299,7 @@ func.func @pad_like_pack_with_outer_dims_perm(%arg0: tensor<32x64xf32>) -> tenso // CHECK: return %[[EXPANDED]] : tensor<32x1x64xf32> func.func @inner_pad_like_pack(%arg0: tensor<32x64xf32>) -> tensor<32x1x64xf32> { %empty = tensor.empty() : tensor<32x1x64xf32> - %0 = tensor.pack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x64xf32> -> tensor<32x1x64xf32> + %0 = linalg.pack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x64xf32> -> tensor<32x1x64xf32> return %0 : tensor<32x1x64xf32> } @@ -309,11 +309,11 @@ func.func @inner_pad_like_pack(%arg0: tensor<32x64xf32>) -> tensor<32x1x64xf32> // CHECK-LABEL: func.func @pad_and_inner_dim_shuffle_pack( // CHECK-SAME: %[[ARG0:.+]]: tensor<32x64xf32>) // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x1x64x32xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] inner_dims_pos = [1, 0] inner_tiles = [64, 32] into %[[EMPTY]] : tensor<32x64xf32> -> tensor<1x1x64x32xf32> +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] inner_dims_pos = [1, 0] inner_tiles = [64, 32] into %[[EMPTY]] : tensor<32x64xf32> -> tensor<1x1x64x32xf32> // CHECK: return %[[PACK]] : tensor<1x1x64x32xf32> func.func @pad_and_inner_dim_shuffle_pack(%arg0: tensor<32x64xf32>) -> tensor<1x1x64x32xf32> { %empty = tensor.empty() : tensor<1x1x64x32xf32> - %0 = tensor.pack %arg0 inner_dims_pos = [1, 0] inner_tiles = [64, 32] into %empty : tensor<32x64xf32> -> tensor<1x1x64x32xf32> + %0 = linalg.pack %arg0 inner_dims_pos = [1, 0] inner_tiles = [64, 32] into %empty : tensor<32x64xf32> -> tensor<1x1x64x32xf32> return %0 : tensor<1x1x64x32xf32> } @@ -323,11 +323,11 @@ func.func @pad_and_inner_dim_shuffle_pack(%arg0: tensor<32x64xf32>) -> tensor<1x // CHECK-LABEL: func.func @pad_like_pack_with_transpose( // CHECK-SAME: %[[ARG0:.+]]: tensor<32x64x16xf32>) // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32x1x16x64xf32> 
-// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [64] into %[[EMPTY]] : tensor<32x64x16xf32> -> tensor<32x1x16x64xf32> +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [64] into %[[EMPTY]] : tensor<32x64x16xf32> -> tensor<32x1x16x64xf32> // CHECK: return %[[PACK]] : tensor<32x1x16x64xf32> func.func @pad_like_pack_with_transpose(%arg0: tensor<32x64x16xf32>) -> tensor<32x1x16x64xf32> { %empty = tensor.empty() : tensor<32x1x16x64xf32> - %0 = tensor.pack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x64x16xf32> -> tensor<32x1x16x64xf32> + %0 = linalg.pack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x64x16xf32> -> tensor<32x1x16x64xf32> return %0 : tensor<32x1x16x64xf32> } @@ -339,7 +339,7 @@ func.func @pad_like_pack_with_transpose(%arg0: tensor<32x64x16xf32>) -> tensor<3 // CHECK: return %[[COLLAPSED]] : tensor<32x64xf32> func.func @unpad_like_unpack(%arg0: tensor<1x1x32x64xf32>) -> tensor<32x64xf32> { %empty = tensor.empty() : tensor<32x64xf32> - %0 = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<1x1x32x64xf32> -> tensor<32x64xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<1x1x32x64xf32> -> tensor<32x64xf32> return %0 : tensor<32x64xf32> } @@ -351,7 +351,7 @@ func.func @unpad_like_unpack(%arg0: tensor<1x1x32x64xf32>) -> tensor<32x64xf32> // CHECK: return %[[COLLAPSED]] : tensor<32x64xf32> func.func @unpad_like_unpack_with_outer_dims_perm(%arg0: tensor<1x1x32x64xf32>) -> tensor<32x64xf32> { %empty = tensor.empty() : tensor<32x64xf32> - %0 = tensor.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<1x1x32x64xf32> -> tensor<32x64xf32> + %0 = linalg.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<1x1x32x64xf32> -> tensor<32x64xf32> return %0 : tensor<32x64xf32> } @@ -363,7 +363,7 @@ func.func @unpad_like_unpack_with_outer_dims_perm(%arg0: tensor<1x1x32x64xf32>) // CHECK: return %[[COLLAPSED]] : tensor<32x64xf32> func.func @inner_unpad_like_unpack(%arg0: tensor<32x1x64xf32>) -> tensor<32x64xf32> { %empty = tensor.empty() : tensor<32x64xf32> - %0 = tensor.unpack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x1x64xf32> -> tensor<32x64xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x1x64xf32> -> tensor<32x64xf32> return %0 : tensor<32x64xf32> } @@ -373,11 +373,11 @@ func.func @inner_unpad_like_unpack(%arg0: tensor<32x1x64xf32>) -> tensor<32x64xf // CHECK-LABEL: func.func @unpad_and_inner_dim_shuffle_pack( // CHECK-SAME: %[[ARG0:.+]]: tensor<1x1x32x64xf32>) // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<64x32xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] inner_dims_pos = [1, 0] inner_tiles = [32, 64] into %[[EMPTY]] : tensor<1x1x32x64xf32> -> tensor<64x32xf32> +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] inner_dims_pos = [1, 0] inner_tiles = [32, 64] into %[[EMPTY]] : tensor<1x1x32x64xf32> -> tensor<64x32xf32> // CHECK: return %[[UNPACK]] : tensor<64x32xf32> func.func @unpad_and_inner_dim_shuffle_pack(%arg0: tensor<1x1x32x64xf32>) -> tensor<64x32xf32> { %empty = tensor.empty() : tensor<64x32xf32> - %0 = tensor.unpack %arg0 inner_dims_pos = [1, 0] inner_tiles = [32, 64] into %empty : tensor<1x1x32x64xf32> -> tensor<64x32xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [1, 0] inner_tiles = [32, 
64] into %empty : tensor<1x1x32x64xf32> -> tensor<64x32xf32> return %0 : tensor<64x32xf32> } @@ -387,10 +387,10 @@ func.func @unpad_and_inner_dim_shuffle_pack(%arg0: tensor<1x1x32x64xf32>) -> ten // CHECK-LABEL: func.func @unpad_like_unpack_with_transpose( // CHECK-SAME: %[[ARG0:.+]]: tensor<32x1x16x64xf32>) // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32x64x16xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [64] into %[[EMPTY]] : tensor<32x1x16x64xf32> -> tensor<32x64x16xf32> +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [64] into %[[EMPTY]] : tensor<32x1x16x64xf32> -> tensor<32x64x16xf32> // CHECK: return %[[UNPACK]] : tensor<32x64x16xf32> func.func @unpad_like_unpack_with_transpose(%arg0: tensor<32x1x16x64xf32>) -> tensor<32x64x16xf32> { %empty = tensor.empty() : tensor<32x64x16xf32> - %0 = tensor.unpack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x1x16x64xf32> -> tensor<32x64x16xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x1x16x64xf32> -> tensor<32x64x16xf32> return %0 : tensor<32x64x16xf32> } diff --git a/mlir/test/Dialect/Linalg/td/decompose-pack.mlir b/mlir/test/Dialect/Linalg/td/decompose-pack.mlir index 49c45e29d5a14..32054134266c7 100644 --- a/mlir/test/Dialect/Linalg/td/decompose-pack.mlir +++ b/mlir/test/Dialect/Linalg/td/decompose-pack.mlir @@ -1,6 +1,6 @@ module @transforms attributes { transform.with_named_sequence } { transform.named_sequence @decompose_pack(%module: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module : (!transform.any_op) -> !transform.any_op + %pack = transform.structured.match ops{["linalg.pack"]} in %module : (!transform.any_op) -> !transform.any_op %1 = transform.get_parent_op %pack {isolated_from_above} : (!transform.any_op) -> !transform.any_op transform.apply_patterns to %1 { diff --git a/mlir/test/Dialect/Linalg/td/decompose-unpack.mlir b/mlir/test/Dialect/Linalg/td/decompose-unpack.mlir index 11243634262e0..f5b8403af5e58 100644 --- a/mlir/test/Dialect/Linalg/td/decompose-unpack.mlir +++ b/mlir/test/Dialect/Linalg/td/decompose-unpack.mlir @@ -1,6 +1,6 @@ module @transforms attributes { transform.with_named_sequence } { transform.named_sequence @decompose_unpack(%module: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.unpack"]} in %module : (!transform.any_op) -> !transform.any_op + %pack = transform.structured.match ops{["linalg.unpack"]} in %module : (!transform.any_op) -> !transform.any_op %1 = transform.get_parent_op %pack {isolated_from_above} : (!transform.any_op) -> !transform.any_op transform.apply_patterns to %1 { diff --git a/mlir/test/Dialect/Linalg/transform-lower-pack.mlir b/mlir/test/Dialect/Linalg/transform-lower-pack.mlir index 5f8ff36a16578..81fd7a8a947d7 100644 --- a/mlir/test/Dialect/Linalg/transform-lower-pack.mlir +++ b/mlir/test/Dialect/Linalg/transform-lower-pack.mlir @@ -4,7 +4,7 @@ func.func @pack(%arg0: tensor<129x47x16x16xf32>, %arg1: tensor<17x2x16x16x32x8xf32>) -> tensor<17x2x16x16x32x8xf32> { %cst_0 = arith.constant 0.0 : f32 - // tensor.pack is lowered to tensor.pad + tensor.expand_shape + linalg.transpose + // linalg.pack is lowered to tensor.pad + tensor.expand_shape + linalg.transpose // CHECK: tensor.pad {{.*}} low[0, 0, 0, 0] // CHECK: : tensor<129x47x16x16xf32> to tensor<136x64x16x16xf32> // CHECK: tensor.expand_shape %{{.*}} [{{.*}}[0, 1], [2, 3], 
[4], [5]] @@ -13,16 +13,16 @@ func.func @pack(%arg0: tensor<129x47x16x16xf32>, %arg1: tensor<17x2x16x16x32x8xf // CHECK-SAME: ins(%{{.*}} : tensor<17x8x2x32x16x16xf32>) // CHECK-SAME: outs(%{{.*}} : tensor<17x2x16x16x32x8xf32>) // CHECK-SAME: permutation = [0, 2, 4, 5, 3, 1] - %pack = tensor.pack %arg0 padding_value(%cst_0 : f32) inner_dims_pos = [1, 0] inner_tiles = [32, 8] into %arg1 + %pack = linalg.pack %arg0 padding_value(%cst_0 : f32) inner_dims_pos = [1, 0] inner_tiles = [32, 8] into %arg1 : tensor<129x47x16x16xf32> -> tensor<17x2x16x16x32x8xf32> return %pack : tensor<17x2x16x16x32x8xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -33,7 +33,7 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func.func @pack( func.func @pack(%arg0: tensor<128x8xf32>, %arg1: tensor<8x8x16x1xf32>) -> tensor<8x8x16x1xf32> { - // tensor.pack is lowered to tensor.pad + tensor.expand_shape + linalg.transpose + // linalg.pack is lowered to tensor.pad + tensor.expand_shape + linalg.transpose // CHECK: tensor.pad {{.*}} low[0, 0] // CHECK: : tensor<128x8xf32> to tensor<128x8xf32> // CHECK: tensor.expand_shape %{{.*}} [{{.*}}[0, 1], [2, 3]] @@ -43,7 +43,7 @@ func.func @pack(%arg0: tensor<128x8xf32>, %arg1: tensor<8x8x16x1xf32>) -> tensor // CHECK-SAME: outs(%{{.*}} : tensor<8x8x16x1xf32>) // CHECK-SAME: permutation = [0, 2, 1, 3] - %pack = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %arg1 + %pack = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %arg1 : tensor<128x8xf32> -> tensor<8x8x16x1xf32> return %pack : tensor<8x8x16x1xf32> @@ -51,9 +51,9 @@ func.func @pack(%arg0: tensor<128x8xf32>, %arg1: tensor<8x8x16x1xf32>) -> tensor module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -67,7 +67,7 @@ module attributes {transform.with_named_sequence} { func.func @pack_as_pad(%arg0: tensor<129x47x16x16xf32>, %arg1: tensor<1x1x1x1x136x64x16x16xf32>) -> tensor<1x1x1x1x136x64x16x16xf32> { %cst_0 = arith.constant 0.0 : f32 - // tensor.pack is lowered to tensor.pad + tensor.insert_slice + // linalg.pack is lowered to tensor.pad + tensor.insert_slice // CHECK: %[[PAD:.*]] = tensor.pad %[[SRC]] low[0, 0, 0, 0] high[7, 17, 0, 0] // CHECK: : tensor<129x47x16x16xf32> to tensor<136x64x16x16xf32> // CHECK: %[[RES:.*]] = 
tensor.insert_slice %[[PAD]] into %[[OUT]] @@ -79,16 +79,16 @@ func.func @pack_as_pad(%arg0: tensor<129x47x16x16xf32>, %arg1: tensor<1x1x1x1x13 // CHECK-SAME: [1, 1, 1, 1, 1, 1, 1, 1] // CHECK-SAME: : tensor<136x64x16x16xf32> into tensor<1x1x1x1x136x64x16x16xf32> // CHECK: return %[[RES]] - %pack = tensor.pack %arg0 padding_value(%cst_0 : f32) inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 + %pack = linalg.pack %arg0 padding_value(%cst_0 : f32) inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 : tensor<129x47x16x16xf32> -> tensor<1x1x1x1x136x64x16x16xf32> return %pack : tensor<1x1x1x1x136x64x16x16xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -101,22 +101,22 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func.func @pack_as_pad_disabled_insert_slice( func.func @pack_as_pad_disabled_insert_slice(%arg0: tensor<129x47x16x16xf32>, %arg1: tensor<1x1x1x1x136x64x16x16xf32>) -> tensor<1x1x1x1x136x64x16x16xf32> { %cst_0 = arith.constant 0.0 : f32 - // tensor.pack is lowered to tensor.pad + tensor.expand_shape + linalg.transpose + // linalg.pack is lowered to tensor.pad + tensor.expand_shape + linalg.transpose // CHECK-SAME: %[[ARG0:[^:]*]]: tensor<129x47x16x16xf32> // CHECK-DAG: %[[PAD:.*]] = tensor.pad %[[ARG0]] // CHECK-NOT: %[[RES:.*]] = tensor.insert_slice %[[PAD]] // CHECK: %[[PAD_EXPANDED:.*]] = tensor.expand_shape %[[PAD]] // CHECK-DAG: %[[RES:.*]] = linalg.transpose ins(%[[PAD_EXPANDED]] - %pack = tensor.pack %arg0 padding_value(%cst_0 : f32) inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 + %pack = linalg.pack %arg0 padding_value(%cst_0 : f32) inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 : tensor<129x47x16x16xf32> -> tensor<1x1x1x1x136x64x16x16xf32> return %pack : tensor<1x1x1x1x136x64x16x16xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack {lowerPadLikeWithInsertSlice = false}: (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack {lowerPadLikeWithInsertSlice = false}: (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -141,16 +141,16 @@ func.func @pack_not_a_pad(%arg0: tensor<129x47x16x16xf32>, %arg1: tensor<1x1x16x // CHECK-SAME: outs(%{{.*}} : tensor<1x1x16x16x136x64xf32>) // CHECK-SAME: permutation = [0, 2, 4, 5, 1, 3] - %pack = tensor.pack %arg0 padding_value(%cst_0 : f32) inner_dims_pos = [0, 1] inner_tiles = [136, 
64] into %arg1 + %pack = linalg.pack %arg0 padding_value(%cst_0 : f32) inner_dims_pos = [0, 1] inner_tiles = [136, 64] into %arg1 : tensor<129x47x16x16xf32> -> tensor<1x1x16x16x136x64xf32> return %pack : tensor<1x1x16x16x136x64xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -172,16 +172,16 @@ func.func @unpack(%arg0: tensor<17x2x16x16x32x8xf32>, %arg1: tensor<129x47x16x16 // CHECK-SAME: : tensor<136x64x16x16xf32> to tensor<129x47x16x16xf32> // CHECK: linalg.copy ins(%[[SLICE]] : tensor<129x47x16x16xf32>) // CHECK-SAME: outs(%[[ARG1]] : tensor<129x47x16x16xf32>) - %unpack = tensor.unpack %arg0 inner_dims_pos = [1, 0] inner_tiles = [32, 8] into %arg1 + %unpack = linalg.unpack %arg0 inner_dims_pos = [1, 0] inner_tiles = [32, 8] into %arg1 : tensor<17x2x16x16x32x8xf32> -> tensor<129x47x16x16xf32> return %unpack : tensor<129x47x16x16xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %unpack = transform.structured.match ops{["tensor.unpack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">) + %unpack = transform.structured.match ops{["linalg.unpack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, @@ -207,16 +207,16 @@ func.func @unpack_with_identity_outer_dims_perm(%arg0: tensor<17x2x16x16x32x8xf3 // CHECK-SAME: : tensor<136x64x16x16xf32> to tensor<129x47x16x16xf32> // CHECK: linalg.copy ins(%[[SLICE]] : tensor<129x47x16x16xf32>) // CHECK-SAME: outs(%[[ARG1]] : tensor<129x47x16x16xf32>) - %unpack = tensor.unpack %arg0 outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [1, 0] inner_tiles = [32, 8] into %arg1 + %unpack = linalg.unpack %arg0 outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [1, 0] inner_tiles = [32, 8] into %arg1 : tensor<17x2x16x16x32x8xf32> -> tensor<129x47x16x16xf32> return %unpack : tensor<129x47x16x16xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %unpack = transform.structured.match ops{["tensor.unpack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">) + %unpack = transform.structured.match ops{["linalg.unpack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, @@ -241,16 +241,16 @@ func.func @unpack_as_pad(%arg0: 
tensor<1x1x1x1x136x64x16x16xf32>, %arg1: tensor< // strides multiplers. // CHECK-SAME: [1, 1, 1, 1, 1, 1, 1, 1] // CHECK-SAME: : tensor<1x1x1x1x136x64x16x16xf32> to tensor<129x47x16x16xf32> - %pack = tensor.unpack %arg0 inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 + %pack = linalg.unpack %arg0 inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 : tensor<1x1x1x1x136x64x16x16xf32> -> tensor<129x47x16x16xf32> return %pack : tensor<129x47x16x16xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %unpack = transform.structured.match ops{["tensor.unpack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">) + %unpack = transform.structured.match ops{["linalg.unpack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, @@ -267,22 +267,22 @@ module attributes {transform.with_named_sequence} { func.func @unpack_as_pad_disabled_extract_slice(%arg0: tensor<1x1x1x1x136x64x16x16xf32>, %arg1: tensor<129x47x16x16xf32>) -> tensor<129x47x16x16xf32> { %cst_0 = arith.constant 0.0 : f32 - // tensor.unpack is lowered to tensor.extract_slice + linalg.transpose + tensor.collapse_shape + // linalg.unpack is lowered to tensor.extract_slice + linalg.transpose + tensor.collapse_shape // CHECK-DAG: %[[ARG0:[^:]*]]: tensor<1x1x1x1x136x64x16x16xf32> // CHECK-NOT: %[[RES:.*]] = tensor.extract_slice %[[ARG0]] // CHECK: %[[TRANSPOSED:.*]] = linalg.transpose ins(%[[ARG0]] // CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[TRANSPOSED]] // CHECK-DAG: %[[RES:.*]] = tensor.extract_slice %[[COLLAPSED]] - %pack = tensor.unpack %arg0 inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 + %pack = linalg.unpack %arg0 inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 : tensor<1x1x1x1x136x64x16x16xf32> -> tensor<129x47x16x16xf32> return %pack : tensor<129x47x16x16xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %unpack = transform.structured.match ops{["tensor.unpack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack {lowerUnpadLikeWithExtractSlice = false}: (!transform.op<"tensor.unpack">) + %unpack = transform.structured.match ops{["linalg.unpack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack {lowerUnpadLikeWithExtractSlice = false}: (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, @@ -305,7 +305,7 @@ func.func @pack_with_outer_dims_perm(%src: tensor<100x200x128x256xi32>, // CHECK-SAME: ins(%{{.*}} : tensor<100x200x4x32x16x16xi32>) // CHECK-SAME: outs(%{{.*}} : tensor<200x4x16x100x16x32xi32>) // CHECK-SAME: permutation = [1, 2, 4, 0, 5, 3] - %0 = tensor.pack %src + %0 = linalg.pack %src outer_dims_perm = [1, 2, 3, 0] inner_dims_pos = [3, 2] inner_tiles = [16, 32] @@ -315,9 +315,9 @@ func.func @pack_with_outer_dims_perm(%src: tensor<100x200x128x256xi32>, module attributes 
{transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -337,7 +337,7 @@ func.func @pack_with_pad(%src: tensor<4225x12xf32>, %dest: tensor<265x16x16x1xf3 // CHECK-SAME: outs(%{{[a-zA-Z0-9]*}} : tensor<265x16x16x1xf32>) // CHECK-SAME: permutation = [0, 2, 1, 3] %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.pack %src + %0 = linalg.pack %src padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %dest @@ -347,9 +347,9 @@ func.func @pack_with_pad(%src: tensor<4225x12xf32>, %dest: tensor<265x16x16x1xf3 module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -370,7 +370,7 @@ func.func @pack_with_pad_and_outer_dims_perm(%src: tensor<100x200x127x255xi32>, // CHECK-SAME: outs(%{{.*}} : tensor<200x4x16x100x16x32xi32>) // CHECK-SAME: permutation = [1, 2, 4, 0, 5, 3] %cst_0 = arith.constant 0 : i32 - %0 = tensor.pack %src + %0 = linalg.pack %src padding_value(%cst_0 : i32) outer_dims_perm = [1, 2, 3, 0] inner_dims_pos = [3, 2] @@ -381,9 +381,9 @@ func.func @pack_with_pad_and_outer_dims_perm(%src: tensor<100x200x127x255xi32>, module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -429,7 +429,7 @@ func.func @dynamic_pack_pad_transpose_inner_and_outer_dims(%source: tensor - %pack = tensor.pack %source padding_value(%padding_value : f32) + %pack = linalg.pack %source padding_value(%padding_value : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 32] into %init_pack : tensor -> tensor return %pack : tensor @@ -437,9 +437,9 @@ func.func @dynamic_pack_pad_transpose_inner_and_outer_dims(%source: tensor !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match 
ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -453,7 +453,7 @@ module attributes {transform.with_named_sequence} { func.func @pack_as_pad_with_outer_dims_perm(%arg0: tensor<129x47x16x16xf32>, %arg1: tensor<1x1x1x1x136x64x16x16xf32>) -> tensor<1x1x1x1x136x64x16x16xf32> { %cst_0 = arith.constant 0.0 : f32 - // tensor.pack is lowered to tensor.pad + tensor.insert_slice + // linalg.pack is lowered to tensor.pad + tensor.insert_slice // CHECK: %[[PAD:.*]] = tensor.pad %[[SRC]] low[0, 0, 0, 0] high[7, 17, 0, 0] // CHECK: : tensor<129x47x16x16xf32> to tensor<136x64x16x16xf32> // CHECK: %[[RES:.*]] = tensor.insert_slice %[[PAD]] into %[[OUT]] @@ -465,7 +465,7 @@ func.func @pack_as_pad_with_outer_dims_perm(%arg0: tensor<129x47x16x16xf32>, %ar // CHECK-SAME: [1, 1, 1, 1, 1, 1, 1, 1] // CHECK-SAME: : tensor<136x64x16x16xf32> into tensor<1x1x1x1x136x64x16x16xf32> // CHECK: return %[[RES]] - %pack = tensor.pack %arg0 + %pack = linalg.pack %arg0 padding_value(%cst_0 : f32) outer_dims_perm = [1, 2, 3, 0] inner_dims_pos = [0, 1, 2, 3] @@ -476,9 +476,9 @@ func.func @pack_as_pad_with_outer_dims_perm(%arg0: tensor<129x47x16x16xf32>, %ar module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -501,7 +501,7 @@ func.func @pack_as_pad_with_unit_dims(%arg0: tensor<3x1x1x1xf32>, %arg1: tensor< // CHECK-SAME: outs(%[[OUT]] : tensor<1x1x1x1x8x1xf32>) // CHECK-SAME: permutation = [0, 2, 4, 5, 1, 3] // CHECK: return %[[TRANSPOSED]] : tensor<1x1x1x1x8x1xf32> - %pack = tensor.pack %arg0 + %pack = linalg.pack %arg0 padding_value(%zero : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %arg1 : tensor<3x1x1x1xf32> -> tensor<1x1x1x1x8x1xf32> @@ -512,9 +512,9 @@ func.func @pack_as_pad_with_unit_dims(%arg0: tensor<3x1x1x1xf32>, %arg1: tensor< module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -541,16 +541,16 @@ module attributes {transform.with_named_sequence} { // CHECK: linalg.copy ins(%[[SLICE]] : tensor<32x?x?xf32>) // CHECK-SAME: outs(%[[ARG1]] : tensor<32x?x?xf32>) func.func @unpack_with_dynamic_dest(%arg0: tensor<32x2x49x16x16xf32>, %arg1: 
tensor<32x?x?xf32>) -> tensor<32x?x?xf32> { - %pack = tensor.unpack %arg0 inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %arg1 + %pack = linalg.unpack %arg0 inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %arg1 : tensor<32x2x49x16x16xf32> -> tensor<32x?x?xf32> return %pack : tensor<32x?x?xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %unpack = transform.structured.match ops{["tensor.unpack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">) + %unpack = transform.structured.match ops{["linalg.unpack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, @@ -582,15 +582,15 @@ module attributes {transform.with_named_sequence} { // CHECK: linalg.copy ins(%[[SLICE]] : tensor) // CHECK-SAME: outs(%[[ARG1]] : tensor) func.func @unpack_with_dynamic_input_dest(%arg0: tensor, %arg1: tensor) -> tensor { - %unpack = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 16] into %arg1 : tensor -> tensor + %unpack = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 16] into %arg1 : tensor -> tensor return %unpack : tensor } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %unpack = transform.structured.match ops{["tensor.unpack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">) + %unpack = transform.structured.match ops{["linalg.unpack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, @@ -626,14 +626,14 @@ module attributes {transform.with_named_sequence} { // CHECK: linalg.copy ins(%[[SLICE]] : tensor) // CHECK-SAME: outs(%[[ARG1]] : tensor) func.func @unpack_fully_dynamic(%source: tensor, %dest: tensor, %tile_n : index, %tile_m : index) -> tensor { - %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor + %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor return %0 : tensor } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %unpack = transform.structured.match ops{["tensor.unpack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">) + %unpack = transform.structured.match ops{["linalg.unpack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, @@ -664,16 +664,16 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: [1, 1, 1, 1, 1, 1, 1, 1] // CHECK-SAME: : tensor<1x1x1x1x136x64x16x16xf32> to tensor 
func.func @unpack_as_pad_dynamic(%arg0: tensor<1x1x1x1x136x64x16x16xf32>, %arg1: tensor) -> tensor { - %pack = tensor.unpack %arg0 inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 + %pack = linalg.unpack %arg0 inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 : tensor<1x1x1x1x136x64x16x16xf32> -> tensor return %pack : tensor } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %unpack = transform.structured.match ops{["tensor.unpack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">) + %unpack = transform.structured.match ops{["linalg.unpack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, @@ -698,16 +698,16 @@ module attributes {transform.with_named_sequence} { // CHECK: linalg.copy ins(%[[SLICE]] // CHECK-SAME: : tensor<32x64xf32>) outs(%[[ARG0]] : tensor<32x64xf32>) -> tensor<32x64xf32> func.func @unpack_with_outer_dims_perm(%arg0: tensor<32x64xf32>, %arg1: tensor<2x4x32x8xf32>) -> tensor<32x64xf32> { - %unpack = tensor.unpack %arg1 outer_dims_perm = [1, 0] + %unpack = linalg.unpack %arg1 outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [32, 8] into %arg0 : tensor<2x4x32x8xf32> -> tensor<32x64xf32> return %unpack : tensor<32x64xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %unpack = transform.structured.match ops{["tensor.unpack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">) + %unpack = transform.structured.match ops{["linalg.unpack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, diff --git a/mlir/test/Dialect/Linalg/transform-op-fuse.mlir b/mlir/test/Dialect/Linalg/transform-op-fuse.mlir index ac1ca9319d335..20019424e8d3c 100644 --- a/mlir/test/Dialect/Linalg/transform-op-fuse.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-fuse.mlir @@ -106,12 +106,12 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func.func @unpack_elemwise // CHECK: %[[RES:.*]] = scf.for // CHECK: scf.for -// CHECK: tensor.unpack +// CHECK: linalg.unpack // CHECK: linalg.elemwise_unary // CHECK: return %[[RES]] func.func @unpack_elemwise(%arg0: tensor<16x48x8x8xf32>, %arg1: tensor<128x384xf32>) -> tensor<128x384xf32> { %0 = tensor.empty() : tensor<128x384xf32> - %1 = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %0 + %1 = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %0 : tensor<16x48x8x8xf32> -> tensor<128x384xf32> %2 = linalg.elemwise_unary ins(%1: tensor<128x384xf32>) outs(%arg1: tensor<128x384xf32>) -> tensor<128x384xf32> @@ -132,12 +132,12 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func.func @pack_elemwise // CHECK: %[[RES:.*]] = scf.for // CHECK: scf.for -// CHECK: tensor.pack +// CHECK: linalg.pack // CHECK: 
linalg.elemwise_unary // CHECK: return %[[RES]] func.func @pack_elemwise(%arg0: tensor<128x384xf32>, %arg1: tensor<16x48x8x8xf32>) -> tensor<16x48x8x8xf32> { %0 = tensor.empty() : tensor<16x48x8x8xf32> - %1 = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %0 + %1 = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %0 : tensor<128x384xf32> -> tensor<16x48x8x8xf32> %2 = linalg.elemwise_unary ins(%1: tensor<16x48x8x8xf32>) outs(%arg1: tensor<16x48x8x8xf32>) -> tensor<16x48x8x8xf32> @@ -156,14 +156,14 @@ module attributes {transform.with_named_sequence} { // ----- // CHECK-LABEL: func.func @nofuse_pack_elemwise -// CHECK: tensor.pack +// CHECK: linalg.pack // CHECK: %[[RES:.*]] = scf.for // CHECK: scf.for // CHECK: linalg.elemwise_unary // CHECK: return %[[RES]] func.func @nofuse_pack_elemwise(%arg0: tensor<128x384xf32>, %arg1: tensor<16x48x8x8xf32>) -> tensor<16x48x8x8xf32> { %0 = tensor.empty() : tensor<16x48x8x8xf32> - %1 = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %0 + %1 = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %0 : tensor<128x384xf32> -> tensor<16x48x8x8xf32> %2 = linalg.elemwise_unary ins(%1: tensor<16x48x8x8xf32>) outs(%arg1: tensor<16x48x8x8xf32>) -> tensor<16x48x8x8xf32> diff --git a/mlir/test/Dialect/Linalg/transform-op-pack.mlir b/mlir/test/Dialect/Linalg/transform-op-pack.mlir index 6c26ebd0a5b84..b3ad73e8df8e7 100644 --- a/mlir/test/Dialect/Linalg/transform-op-pack.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-pack.mlir @@ -15,9 +15,9 @@ // CHECK-SAME: %[[T1:.+]]: tensor<3xf16> func.func @reduction_2d_static(%t0: tensor<3x7xf16>, %t1: tensor<3xf16>) -> tensor<3xf16> { // CHECK: %[[EMPTY:.*]] = tensor.empty() : tensor<3x2x4xf16> - // CHECK: %[[PACKED:.*]] = tensor.pack %[[T0]] padding_value(%{{.*}} : f16) + // CHECK: %[[PACKED:.*]] = linalg.pack %[[T0]] padding_value(%{{.*}} : f16) // CHECK-SAME: inner_dims_pos = [1] inner_tiles = [4] into %[[EMPTY]] : tensor<3x7xf16> -> tensor<3x2x4xf16> - // CHECK-NOT: tensor.pack + // CHECK-NOT: linalg.pack // CHECK: linalg.generic // CHECK-SAME: indexing_maps = [#[[$PACKED_MAP_0]], #[[$PACKED_MAP_1]]] // CHECK-SAME: iterator_types = ["parallel", "reduction", "reduction"] @@ -29,7 +29,7 @@ func.func @reduction_2d_static(%t0: tensor<3x7xf16>, %t1: tensor<3xf16>) -> tens linalg.yield %3 : f16 } -> tensor<3xf16> - // CHECK-NOT: tensor.unpack + // CHECK-NOT: linalg.unpack return %2 : tensor<3xf16> } @@ -59,9 +59,9 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[T1:.+]]: tensor<3xf16> func.func @col_reduction_2d_static(%t0: tensor<7x3xf16>, %t1: tensor<3xf16>) -> tensor<3xf16> { // CHECK: %[[EMPTY:.*]] = tensor.empty() : tensor<3x2x4xf16> - // CHECK: %[[PACKED:.*]] = tensor.pack %[[T0]] padding_value(%{{.*}} : f16) + // CHECK: %[[PACKED:.*]] = linalg.pack %[[T0]] padding_value(%{{.*}} : f16) // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [0] inner_tiles = [4] into %[[EMPTY]] : tensor<7x3xf16> -> tensor<3x2x4xf16> - // CHECK-NOT: tensor.pack + // CHECK-NOT: linalg.pack // CHECK: linalg.generic // CHECK-SAME: indexing_maps = [#[[$PACKED_MAP_0]], #[[$PACKED_MAP_1]]] // CHECK-SAME: iterator_types = ["reduction", "parallel", "reduction"] @@ -73,7 +73,7 @@ func.func @col_reduction_2d_static(%t0: tensor<7x3xf16>, %t1: tensor<3xf16>) -> linalg.yield %3 : f16 } -> tensor<3xf16> - // CHECK-NOT: tensor.unpack + // CHECK-NOT: linalg.unpack return %2 : tensor<3xf16> } @@ -83,12 +83,12 @@ module attributes 
{transform.with_named_sequence} { %1 = transform.structured.pack %0 packed_sizes = [4, 0] : (!transform.any_op) -> (!transform.op<"linalg.generic">) %pack = transform.get_producer_of_operand %1[0] - : (!transform.op<"linalg.generic">) -> (!transform.op<"tensor.pack">) + : (!transform.op<"linalg.generic">) -> (!transform.op<"linalg.pack">) %2, %pack_2, %empty_unpack_2 = transform.structured.pack_transpose %pack with_compute_op(%1) outer_perm = [1, 0] - : (!transform.op<"tensor.pack">, !transform.op<"linalg.generic">) - -> (!transform.op<"linalg.generic">, !transform.op<"tensor.pack">, !transform.any_op) + : (!transform.op<"linalg.pack">, !transform.op<"linalg.generic">) + -> (!transform.op<"linalg.generic">, !transform.op<"linalg.pack">, !transform.any_op) transform.yield } } @@ -116,9 +116,9 @@ func.func @reduction_2d_dynamic(%t0: tensor, %t1: tensor) -> ten // CHECK-DAG: %[[D1:.*]] = tensor.dim %[[T0]], %[[C1]] : tensor // CHECK: %[[D1B4:.*]] = affine.apply #[[$DIV4]]()[%[[D1]]] // CHECK: %[[EMPTY:.*]] = tensor.empty(%[[D0]], %[[D1B4]]) : tensor - // CHECK: %[[PACKED:.*]] = tensor.pack %[[T0]] padding_value(%{{.*}} : f16) + // CHECK: %[[PACKED:.*]] = linalg.pack %[[T0]] padding_value(%{{.*}} : f16) // CHECK-SAME: inner_dims_pos = [1] inner_tiles = [4] into %[[EMPTY]] : tensor -> tensor - // CHECK-NOT: tensor.pack + // CHECK-NOT: linalg.pack // CHECK: linalg.generic // CHECK-SAME: indexing_maps = [#[[$PACKED_MAP_0]], #[[$PACKED_MAP_1]]] // CHECK-SAME: iterator_types = ["parallel", "reduction", "reduction"] @@ -130,7 +130,7 @@ func.func @reduction_2d_dynamic(%t0: tensor, %t1: tensor) -> ten linalg.yield %3 : f16 } -> tensor - // CHECK-NOT: tensor.unpack + // CHECK-NOT: linalg.unpack return %2 : tensor } @@ -162,11 +162,11 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[T0:.+]]: tensor, // CHECK-SAME: %[[T1:.+]]: tensor func.func @reduction_2d_dynamic(%t0: tensor, %t1: tensor) -> tensor { - // CHECK: %[[PACKED_0:.*]] = tensor.pack %[[T0]] padding_value(%{{.*}} : f16) + // CHECK: %[[PACKED_0:.*]] = linalg.pack %[[T0]] padding_value(%{{.*}} : f16) // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [3, 4] into %{{.*}} : tensor -> tensor - // CHECK: %[[PACKED_1:.*]] = tensor.pack %[[T1]] padding_value(%{{.*}} : f16) + // CHECK: %[[PACKED_1:.*]] = linalg.pack %[[T1]] padding_value(%{{.*}} : f16) // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [3] into %{{.*}} : tensor -> tensor - // CHECK-NOT: tensor.pack + // CHECK-NOT: linalg.pack // CHECK: linalg.generic // CHECK-SAME: indexing_maps = [#[[$PACKED_MAP_0]], #[[$PACKED_MAP_1]]] // CHECK-SAME: iterator_types = ["parallel", "reduction", "parallel", "reduction"] @@ -178,7 +178,7 @@ func.func @reduction_2d_dynamic(%t0: tensor, %t1: tensor) -> ten linalg.yield %3 : f16 } -> tensor - // CHECK: tensor.unpack %{{.*}} inner_dims_pos = [0] inner_tiles = [3] into %{{.*}} : tensor -> tensor + // CHECK: linalg.unpack %{{.*}} inner_dims_pos = [0] inner_tiles = [3] into %{{.*}} : tensor -> tensor return %2 : tensor } @@ -207,11 +207,11 @@ module attributes {transform.with_named_sequence} { func.func @matmul(%A: tensor, %B: tensor, %C: tensor) -> tensor { - // CHECK: %[[PACK_A:.*]] = tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [2, 4] + // CHECK: %[[PACK_A:.*]] = linalg.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [2, 4] // CHECK-SAME: : tensor -> tensor - // CHECK: %[[PACK_B:.*]] = tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [3, 4] + // CHECK: %[[PACK_B:.*]] = linalg.pack %{{.*}} inner_dims_pos = [1, 
0] inner_tiles = [3, 4] // CHECK-SAME: : tensor -> tensor - // CHECK: %[[PACK_C:.*]] = tensor.pack %{{.*}} outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [3, 2] + // CHECK: %[[PACK_C:.*]] = linalg.pack %{{.*}} outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [3, 2] // CHECK-SAME: : tensor -> tensor // CHECK: linalg.generic {indexing_maps = [#[[$PACKED_MAP_0]], #[[$PACKED_MAP_1]], #[[$PACKED_MAP_2]]] @@ -222,7 +222,7 @@ func.func @matmul(%A: tensor, %B: tensor, %C: tensor) outs(%C: tensor) -> tensor - // CHECK: tensor.unpack %{{.*}} outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [3, 2] + // CHECK: linalg.unpack %{{.*}} outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [3, 2] // CHECK-SAME: : tensor -> tensor return %0 : tensor } @@ -235,12 +235,12 @@ module attributes {transform.with_named_sequence} { : (!transform.any_op) -> (!transform.op<"linalg.generic">) %unpack = transform.get_consumers_of_result %1[0] - : (!transform.op<"linalg.generic">) -> (!transform.op<"tensor.unpack">) + : (!transform.op<"linalg.generic">) -> (!transform.op<"linalg.unpack">) %2, %pack_2, %unpack_2 = transform.structured.pack_transpose %unpack with_compute_op(%1) outer_perm = [1, 0] inner_perm = [1, 0] - : (!transform.op<"tensor.unpack">, !transform.op<"linalg.generic">) - -> (!transform.op<"linalg.generic">, !transform.op<"tensor.pack">, !transform.op<"tensor.unpack">) + : (!transform.op<"linalg.unpack">, !transform.op<"linalg.generic">) + -> (!transform.op<"linalg.generic">, !transform.op<"linalg.pack">, !transform.op<"linalg.unpack">) transform.yield } } @@ -259,11 +259,11 @@ module attributes {transform.with_named_sequence} { func.func @conv_2d_nchw_fchw(%i: tensor<14x512x28x28xf32>, %f: tensor<1024x512x1x1xf32>, %o: tensor<14x1024x28x28xf32>) -> tensor<14x1024x28x28xf32> { - // CHECK: %[[PACK_INPUT:.*]] = tensor.pack %{{.*}} inner_dims_pos = [1] inner_tiles = [8] + // CHECK: %[[PACK_INPUT:.*]] = linalg.pack %{{.*}} inner_dims_pos = [1] inner_tiles = [8] // CHECK-SAME: : tensor<14x512x28x28xf32> -> tensor<14x64x28x28x8xf32> - // CHECK: %[[PACK_FILTER:.*]] = tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [4, 8] + // CHECK: %[[PACK_FILTER:.*]] = linalg.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [4, 8] // CHECK-SAME: : tensor<1024x512x1x1xf32> -> tensor<256x64x1x1x4x8xf32> - // CHECK: %[[PACK_INPUT:.*]] = tensor.pack %{{.*}} inner_dims_pos = [1] inner_tiles = [4] + // CHECK: %[[PACK_INPUT:.*]] = linalg.pack %{{.*}} inner_dims_pos = [1] inner_tiles = [4] // CHECK-SAME: : tensor<14x1024x28x28xf32> -> tensor<14x256x28x28x4xf32> // CHECK: linalg.generic {indexing_maps = [#[[$PACKED_MAP_0]], #[[$PACKED_MAP_1]], #[[$PACKED_MAP_2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction", "parallel", "reduction"]} @@ -272,7 +272,7 @@ func.func @conv_2d_nchw_fchw(%i: tensor<14x512x28x28xf32>, %f: tensor<1024x512x1 %0 = linalg.conv_2d_nchw_fchw ins(%i, %f: tensor<14x512x28x28xf32>, tensor<1024x512x1x1xf32>) outs(%o: tensor<14x1024x28x28xf32>) -> tensor<14x1024x28x28xf32> - // CHECK: tensor.unpack %{{.*}} inner_dims_pos = [1] inner_tiles = [4] + // CHECK: linalg.unpack %{{.*}} inner_dims_pos = [1] inner_tiles = [4] // CHECK-SAME: : tensor<14x256x28x28x4xf32> -> tensor<14x1024x28x28xf32> return %0: tensor<14x1024x28x28xf32> } @@ -300,11 +300,11 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[INIT:.+]]: tensor func.func @conv_2d_nhwc_hwcf(%input: tensor, 
%filter: tensor<1x?x?x?xf32>, %init: tensor) -> tensor { - // CHECK: %[[PACK_INPUT:.*]] = tensor.pack %{{.*}} inner_dims_pos = [3] inner_tiles = [6] + // CHECK: %[[PACK_INPUT:.*]] = linalg.pack %{{.*}} inner_dims_pos = [3] inner_tiles = [6] // CHECK-SAME: : tensor -> tensor - // CHECK: %[[PACK_FILTER:.*]] = tensor.pack %{{.*}} inner_dims_pos = [3, 2] inner_tiles = [4, 6] + // CHECK: %[[PACK_FILTER:.*]] = linalg.pack %{{.*}} inner_dims_pos = [3, 2] inner_tiles = [4, 6] // CHECK-SAME: : tensor<1x?x?x?xf32> -> tensor<1x?x?x?x4x6xf32> - // CHECK: %[[PACK_OUTPUT:.*]] = tensor.pack %{{.*}} inner_dims_pos = [3] inner_tiles = [4] + // CHECK: %[[PACK_OUTPUT:.*]] = linalg.pack %{{.*}} inner_dims_pos = [3] inner_tiles = [4] // CHECK-SAME: : tensor -> tensor // CHECK: linalg.generic {indexing_maps = [#[[$PACKED_MAP_0]], #[[$PACKED_MAP_1]], #[[$PACKED_MAP_2]]] @@ -315,7 +315,7 @@ func.func @conv_2d_nhwc_hwcf(%input: tensor, %filter: tensor<1x?x?x ins (%input, %filter: tensor, tensor<1x?x?x?xf32>) outs (%init: tensor) -> tensor - // CHECK: tensor.unpack %{{.*}} inner_dims_pos = [3] inner_tiles = [4] + // CHECK: linalg.unpack %{{.*}} inner_dims_pos = [3] inner_tiles = [4] // CHECK-SAME: : tensor -> tensor return %0 : tensor } @@ -349,11 +349,11 @@ func.func @matmul_dynamic_pack_size(%A: tensor, %B: tensor, %C // CHECK: %[[TS:.*]] = "some_tile_size"() : () -> index %sz = "some_tile_size"() : () -> (index) - // CHECK: %[[PACK_A:.*]] = tensor.pack %[[A]] {{.*}} inner_dims_pos = [1] inner_tiles = [%[[TS]]] + // CHECK: %[[PACK_A:.*]] = linalg.pack %[[A]] {{.*}} inner_dims_pos = [1] inner_tiles = [%[[TS]]] // CHECK-SAME: : tensor -> tensor - // CHECK: %[[PACK_B:.*]] = tensor.pack %[[B]] {{.*}} inner_dims_pos = [1, 0] inner_tiles = [%[[TS]], %[[TS]]] + // CHECK: %[[PACK_B:.*]] = linalg.pack %[[B]] {{.*}} inner_dims_pos = [1, 0] inner_tiles = [%[[TS]], %[[TS]]] // CHECK-SAME: : tensor -> tensor - // CHECK: %[[PACK_C:.*]] = tensor.pack %[[C]] {{.*}} inner_dims_pos = [1] inner_tiles = [%[[TS]]] + // CHECK: %[[PACK_C:.*]] = linalg.pack %[[C]] {{.*}} inner_dims_pos = [1] inner_tiles = [%[[TS]]] // CHECK-SAME: : tensor -> tensor // CHECK: linalg.generic {indexing_maps = [#[[$PACKED_MAP_0]], #[[$PACKED_MAP_1]], #[[$PACKED_MAP_2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "reduction"]} @@ -363,7 +363,7 @@ func.func @matmul_dynamic_pack_size(%A: tensor, %B: tensor, %C outs(%C: tensor) -> tensor - // CHECK: tensor.unpack %{{.*}} inner_dims_pos = [1] inner_tiles = [%[[TS]]] into %[[C]] + // CHECK: linalg.unpack %{{.*}} inner_dims_pos = [1] inner_tiles = [%[[TS]]] into %[[C]] // CHECK-SAME: : tensor -> tensor return %0 : tensor } @@ -445,16 +445,16 @@ module attributes {transform.with_named_sequence} { // ----- func.func @no_single_packing_op(%source: tensor<128x256xf32>, %dest: tensor<4x16x32x16xf32>) { - %0 = tensor.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> - %1 = tensor.unpack %0 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %source : tensor<4x16x32x16xf32> -> tensor<128x256xf32> - %2 = tensor.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> + %0 = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> + %1 = linalg.unpack %0 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %source : tensor<4x16x32x16xf32> -> tensor<128x256xf32> + %2 = linalg.pack %source 
inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> return } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{requires target to map to exactly 1 packing op and 1 packed op (got 2 and 1)}} transform.structured.pack_transpose %0 with_compute_op(%1) inner_perm = [0] @@ -476,7 +476,7 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["arith.constant"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1 = transform.structured.match ops{["tensor.empty"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{requires target to map to a tensor.pack or tensor.unpack}} + // expected-error @below {{requires target to map to a linalg.pack or linalg.unpack}} transform.structured.pack_transpose %0 with_compute_op(%1) inner_perm = [0] : (!transform.any_op, !transform.any_op) @@ -488,14 +488,14 @@ module attributes {transform.with_named_sequence} { // ----- func.func @no_linalg_target(%source: tensor<128x256xf32>, %dest: tensor<4x16x32x16xf32>) { - %0 = tensor.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> + %0 = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> %1 = arith.constant 0 : index return } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1 = transform.structured.match ops{["arith.constant"]} in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{requires a LinalgOp target}} transform.structured.pack_transpose %0 with_compute_op(%1) @@ -509,7 +509,7 @@ module attributes {transform.with_named_sequence} { // ----- func.func @no_single_use_by_linalg(%source: tensor<128x256xf32>, %dest: tensor<4x16x32x16xf32>) { - %0 = tensor.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> + %0 = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> %f0 = arith.constant 0.0 : f32 %1 = tensor.empty() : tensor %2 = linalg.fill ins(%f0: f32) outs(%1 : tensor) -> tensor @@ -518,7 +518,7 @@ func.func @no_single_use_by_linalg(%source: tensor<128x256xf32>, %dest: tensor<4 module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match 
ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{not a single use by the LinalgOp target}} transform.structured.pack_transpose %0 with_compute_op(%1) @@ -532,8 +532,8 @@ module attributes {transform.with_named_sequence} { // ----- func.func @not_produced_by_linalg(%source: tensor<128x256xf32>, %dest: tensor<4x16x32x16xf32>) { - %a = tensor.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> - %b = tensor.unpack %a inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %source : tensor<4x16x32x16xf32> -> tensor<128x256xf32> + %a = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> + %b = linalg.unpack %a inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %source : tensor<4x16x32x16xf32> -> tensor<128x256xf32> %f0 = arith.constant 0.0 : f32 %1 = tensor.empty() : tensor %2 = linalg.fill ins(%f0: f32) outs(%1 : tensor) -> tensor @@ -542,7 +542,7 @@ func.func @not_produced_by_linalg(%source: tensor<128x256xf32>, %dest: tensor<4x module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{not produced by the LinalgOp target}} transform.structured.pack_transpose %0 with_compute_op(%1) @@ -559,13 +559,13 @@ func.func @no_matching_pack(%source: tensor<16xf32>) { %f0 = arith.constant 0.0 : f32 %1 = tensor.empty() : tensor<4x4xf32> %2 = linalg.fill ins(%f0: f32) outs(%1 : tensor<4x4xf32>) -> tensor<4x4xf32> - %b = tensor.unpack %2 inner_dims_pos = [0] inner_tiles = [4] into %source : tensor<4x4xf32> -> tensor<16xf32> + %b = linalg.unpack %2 inner_dims_pos = [0] inner_tiles = [4] into %source : tensor<4x4xf32> -> tensor<16xf32> return } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{could not find matching pack op}} transform.structured.pack_transpose %0 with_compute_op(%1) @@ -593,13 +593,13 @@ module attributes {transform.with_named_sequence} { : (!transform.any_op) -> (!transform.op<"linalg.generic">) %unpack = transform.get_consumers_of_result %1[0] - : (!transform.op<"linalg.generic">) -> (!transform.op<"tensor.unpack">) + : (!transform.op<"linalg.generic">) -> (!transform.op<"linalg.unpack">) %2, %pack_2, %unpack_2 = // expected-error @below {{invalid outer_perm}} transform.structured.pack_transpose %unpack with_compute_op(%1) outer_perm = [1] - : (!transform.op<"tensor.unpack">, !transform.op<"linalg.generic">) - -> (!transform.op<"linalg.generic">, !transform.op<"tensor.pack">, !transform.op<"tensor.unpack">) + : 
(!transform.op<"linalg.unpack">, !transform.op<"linalg.generic">) + -> (!transform.op<"linalg.generic">, !transform.op<"linalg.pack">, !transform.op<"linalg.unpack">) transform.yield } } @@ -621,13 +621,13 @@ module attributes {transform.with_named_sequence} { : (!transform.any_op) -> (!transform.op<"linalg.generic">) %unpack = transform.get_consumers_of_result %1[0] - : (!transform.op<"linalg.generic">) -> (!transform.op<"tensor.unpack">) + : (!transform.op<"linalg.generic">) -> (!transform.op<"linalg.unpack">) %2, %pack_2, %unpack_2 = // expected-error @below {{invalid inner_perm}} transform.structured.pack_transpose %unpack with_compute_op(%1) inner_perm = [1] - : (!transform.op<"tensor.unpack">, !transform.op<"linalg.generic">) - -> (!transform.op<"linalg.generic">, !transform.op<"tensor.pack">, !transform.op<"tensor.unpack">) + : (!transform.op<"linalg.unpack">, !transform.op<"linalg.generic">) + -> (!transform.op<"linalg.generic">, !transform.op<"linalg.pack">, !transform.op<"linalg.unpack">) transform.yield } } @@ -643,12 +643,12 @@ func.func @no_padding_on_packs(%A: tensor<32x32xf32>, %B: tensor<32x32xf32>, %C: } // CHECK-LABEL: no_padding_on_packs -// CHECK: tensor.pack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [4, 8] +// CHECK: linalg.pack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [4, 8] // CHECK-SAME: into %{{.+}} : tensor<32x32xf32> -> tensor<8x4x4x8xf32> -// CHECK: tensor.pack %{{.+}} outer_dims_perm = [1, 0] +// CHECK: linalg.pack %{{.+}} outer_dims_perm = [1, 0] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [8, 8] // CHECK-SAME: into %{{.+}} : tensor<32x32xf32> -> tensor<4x4x8x8xf32> -// CHECK: tensor.pack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [4, 8] +// CHECK: linalg.pack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [4, 8] // CHECK-SAME: into %{{.+}} : tensor<32x32xf32> -> tensor<8x4x4x8xf32> module attributes {transform.with_named_sequence} { @@ -657,12 +657,12 @@ module attributes {transform.with_named_sequence} { %1 = transform.structured.pack %0 packed_sizes = [4, 8, 8] : (!transform.any_op) -> (!transform.op<"linalg.generic">) %pack = transform.get_producer_of_operand %1[1] - : (!transform.op<"linalg.generic">) -> (!transform.op<"tensor.pack">) + : (!transform.op<"linalg.generic">) -> (!transform.op<"linalg.pack">) %2, %pack_2, %empty_unpack_2 = transform.structured.pack_transpose %pack with_compute_op(%1) outer_perm = [1, 0] inner_perm = [1, 0] - : (!transform.op<"tensor.pack">, !transform.op<"linalg.generic">) - -> (!transform.op<"linalg.generic">, !transform.op<"tensor.pack">, !transform.any_op) + : (!transform.op<"linalg.pack">, !transform.op<"linalg.generic">) + -> (!transform.op<"linalg.generic">, !transform.op<"linalg.pack">, !transform.any_op) transform.yield } } diff --git a/mlir/test/Dialect/Linalg/transform-op-tile-pack-unpack.mlir b/mlir/test/Dialect/Linalg/transform-op-tile-pack-unpack.mlir new file mode 100644 index 0000000000000..456a5ea453963 --- /dev/null +++ b/mlir/test/Dialect/Linalg/transform-op-tile-pack-unpack.mlir @@ -0,0 +1,491 @@ +// RUN: mlir-opt %s -transform-interpreter -canonicalize -cse -split-input-file | FileCheck %s + +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 * 32)> +// CHECK: func.func @NC_to_NCnc +// CHECK-SAME: %[[IN:.*]]: tensor<128x256xf32>, +// CHECK-SAME: %[[OUT:.*]]: tensor<4x8x32x32xf32>) -> tensor<4x8x32x32xf32> { +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index +// CHECK-DAG: 
%[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[RES0:.*]] = scf.for %[[N:.*]] = %[[C0]] to %[[C4]] step %[[C2]] iter_args(%[[ITER0:.*]] = %[[OUT]]) -> (tensor<4x8x32x32xf32>) { +// CHECK: %[[RES1:.+]] = scf.for %[[C:.*]] = %[[C0]] to %[[C8]] step %[[C4]] iter_args(%[[ITER1:.*]] = %[[ITER0]]) -> (tensor<4x8x32x32xf32>) { +// CHECK-DAG: %[[IN_N:.+]] = affine.apply #[[MAP0]](%[[N]]) +// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP0]](%[[C]]) +// CHECK: %[[SUB_IN:.*]] = tensor.extract_slice %[[IN]][%[[IN_N]], %[[IN_C]]] [64, 128] [1, 1] : tensor<128x256xf32> to tensor<64x128xf32> +// CHECK: %[[SUB_OUT:.*]] = tensor.extract_slice %[[ITER1]][%[[N]], %[[C]], 0, 0] [2, 4, 32, 32] [1, 1, 1, 1] : tensor<4x8x32x32xf32> to tensor<2x4x32x32xf32> +// CHECK: %[[SUB_RES:.*]] = linalg.pack +// CHECK-SAME: %[[SUB_IN]] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[SUB_OUT]] +// CHECK: %[[INSERT:.*]] = tensor.insert_slice %[[SUB_RES]] into %[[ITER1]] +// CHECK: scf.yield %[[INSERT]] : tensor<4x8x32x32xf32> +// CHECK: } +// CHECK: scf.yield %[[RES1:.*]] : tensor<4x8x32x32xf32> +// CHECK: } +// CHECK: return %[[RES0:.*]] : tensor<4x8x32x32xf32> +// CHECK: } +func.func @NC_to_NCnc(%arg0: tensor<128x256xf32>, %arg1: tensor<4x8x32x32xf32>) -> tensor<4x8x32x32xf32> { + %0 = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %arg1 : tensor<128x256xf32> -> tensor<4x8x32x32xf32> + return %0 : tensor<4x8x32x32xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK: #[[MAP0:.+]] = affine_map<(d0) -> (d0 * 8)> +// CHECK: func.func @KC_to_CKkc +// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: +// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index +// CHECK: scf.for %[[C:.+]] = %[[C0]] to %[[C32]] step %[[C2]] +// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP0]](%[[C]]) +// CHECK: %[[INPUT_SLICE:.+]] = tensor.extract_slice %[[IN]] +// CHECK-SAME: [0, %[[IN_C]]] [128, 16] +// CHECK: %[[OUTPUT_SLICE:.+]] = tensor.extract_slice %{{.+}}[%[[C]], 0, 0, 0] [2, 4, 32, 8] +// CHECK: linalg.pack +// CHECK-SAME: %[[INPUT_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] +// CHECK-SAME: into %[[OUTPUT_SLICE]] +func.func @KC_to_CKkc(%arg0: tensor<128x256xf32>, %arg1: tensor<32x4x32x8xf32>) -> tensor<32x4x32x8xf32> { + %0 = linalg.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<128x256xf32> -> tensor<32x4x32x8xf32> + return %0 : tensor<32x4x32x8xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 * 2)> +// CHECK-DAG: #[[MAP1:.+]] = 
affine_map<(d0) -> (d0 * -2 + 15, 8)> +// CHECK: func.func @pad_and_pack_static( +// CHECK-SAME: %[[IN:.*]]: tensor<13x15xf32>, +// CHECK-SAME: %[[OUT:.*]]: tensor<2x8x8x2xf32>, +// CHECK-SAME: %[[PAD:.*]]: f32) -> tensor<2x8x8x2xf32> { +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[RES0:.*]] = scf.for %[[J:.*]] = %[[C0]] to %[[C8]] step %[[C4]] iter_args(%[[ITER1:.*]] = %[[OUT]]) -> (tensor<2x8x8x2xf32>) { +// CHECK-DAG: %[[IN_J:.*]] = affine.apply #[[MAP0]](%[[J]]) +// CHECK-DAG: %[[IN_J_SZ:.*]] = affine.min #[[MAP1]](%[[J]]) +// CHECK: %[[SUB_IN:.*]] = tensor.extract_slice %[[IN]][0, %[[IN_J]]] [13, %[[IN_J_SZ]]] [1, 1] +// CHECK: %[[SUB_OUT:.*]] = tensor.extract_slice %[[ITER1]][0, %[[J]], 0, 0] [2, 4, 8, 2] [1, 1, 1, 1] +// CHECK: %[[SUB_RES:.*]] = linalg.pack +// CHECK-SAME: %[[SUB_IN]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] +// CHECK-SAME: into %[[SUB_OUT]] +// CHECK: %[[INSERT:.*]] = tensor.insert_slice %[[SUB_RES]] into %[[ITER1]] +// CHECK: scf.yield %[[INSERT]] : tensor<2x8x8x2xf32> +// CHECK: } +// CHECK: return %[[RES0:.*]] : tensor<2x8x8x2xf32> +// CHECK: } +func.func @pad_and_pack_static(%input: tensor<13x15xf32>, %output: tensor<2x8x8x2xf32>, %pad: f32) -> tensor<2x8x8x2xf32> { + %0 = linalg.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor<13x15xf32> -> tensor<2x8x8x2xf32> + return %0 : tensor<2x8x8x2xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> (d0 * 8)> +// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -8 + s0, d0 * 8)> +// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0) -> (d0 * 2)> +// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -2 + s0, d0 * 2)> +// CHECK: func.func @pad_and_pack_partially_dynamic( +// CHECK-SAME: %[[IN:.*]]: tensor, +// CHECK-SAME: %[[OUT:.*]]: tensor, +// CHECK-SAME: %[[PAD:.*]]: f32) -> tensor { +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[OUT_D0:.*]] = tensor.dim %[[OUT]], %[[C0]] : tensor +// CHECK-DAG: %[[OUT_D1:.*]] = tensor.dim %[[OUT]], %[[C1]] : tensor +// CHECK: %[[RES0:.*]] = scf.for %[[I:.*]] = %[[C0]] to %[[OUT_D0]] step %[[C2]] iter_args(%[[ITER0:.*]] = %[[OUT]]) -> (tensor) { +// CHECK: %[[RES1:.*]] = scf.for %[[J:.*]] = %[[C0]] to %[[OUT_D1]] step %[[C4]] iter_args(%[[ITER1:.*]] = %[[ITER0]]) -> (tensor) { +// CHECK-DAG: %[[OUT_I_SZ:.*]] = affine.min #[[MAP0]](%[[I]])[%[[OUT_D0]]] +// CHECK-DAG: %[[OUT_J_SZ:.*]] = affine.min #[[MAP1]](%[[J]])[%[[OUT_D1]]] +// CHECK-DAG: %[[IN_I:.*]] = affine.apply #[[MAP2]](%[[I]]) +// CHECK-DAG: %[[IN_I_SZ:.*]] = affine.min #[[MAP3]] +// CHECK-DAG: %[[IN_J:.*]] = affine.apply 
#[[MAP4]](%[[J]]) +// CHECK-DAG: %[[IN_J_SZ:.*]] = affine.min #[[MAP5]] +// CHECK: %[[SUB_IN:.*]] = tensor.extract_slice %[[IN]][%[[IN_I]], %[[IN_J]]] [%[[IN_I_SZ]], %[[IN_J_SZ]]] [1, 1] : tensor to tensor +// CHECK: %[[SUB_OUT:.*]] = tensor.extract_slice %[[ITER1]][%[[I]], %[[J]], 0, 0] [%[[OUT_I_SZ]], %[[OUT_J_SZ]], 8, 2] [1, 1, 1, 1] : tensor to tensor +// CHECK: %[[SUB_RES:.*]] = linalg.pack +// CHECK-SAME: %[[SUB_IN]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] +// CHECK-SAME: into %[[SUB_OUT]] +// CHECK: %[[INSERT:.*]] = tensor.insert_slice %[[SUB_RES]] into %[[ITER1]] +// CHECK: scf.yield %[[INSERT]] : tensor +// CHECK: } +// CHECK: scf.yield %[[RES1:.*]] : tensor +// CHECK: } +// CHECK: return %[[VAL_34:.*]] : tensor +// CHECK: } +func.func @pad_and_pack_partially_dynamic(%input: tensor, %output: tensor, %pad: f32) -> tensor { + %0 = linalg.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor -> tensor + return %0 : tensor +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s0, -(d1 * s0) + s1)> +// CHECK: func.func @pad_and_pack_fully_dynamic( +// CHECK-SAME: %[[IN:.*]]: tensor, +// CHECK-SAME: %[[OUT:.*]]: tensor, +// CHECK-SAME: %[[PAD:.*]]: f32, +// CHECK-SAME: %[[TILE_0:.*]]: index, +// CHECK-SAME: %[[TILE_1:.*]]: index) -> tensor { +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[OUT_D0:.*]] = tensor.dim %[[OUT]], %[[C0]] : tensor +// CHECK-DAG: %[[OUT_D1:.*]] = tensor.dim %[[OUT]], %[[C1]] : tensor +// CHECK: %[[RES0:.*]] = scf.for %[[I:.*]] = %[[C0]] to %[[OUT_D0]] step %[[C2]] iter_args(%[[ITER0:.*]] = %[[OUT]]) -> (tensor) { +// CHECK: %[[RES1:.*]] = scf.for %[[J:.*]] = %[[C0]] to %[[OUT_D1]] step %[[C4]] iter_args(%[[ITER1:.*]] = %[[ITER0]]) -> (tensor) { +// CHECK-DAG: %[[OUT_I_SZ:.*]] = affine.min #[[MAP0]](%[[I]])[%[[OUT_D0]]] +// CHECK-DAG: %[[OUT_J_SZ:.*]] = affine.min #[[MAP1]](%[[J]])[%[[OUT_D1]]] +// CHECK-DAG: %[[IN_D0:.*]] = tensor.dim %[[IN]], %[[C0]] +// CHECK-DAG: %[[IN_D1:.*]] = tensor.dim %[[IN]], %[[C1]] +// CHECK: %[[IN_I:.*]] = affine.apply #[[MAP2]](%[[I]])[%[[TILE_0]]] +// CHECK: %[[IN_I_SZ:.*]] = affine.min #[[MAP3]](%[[OUT_I_SZ]], %[[I]])[%[[TILE_0]], %[[IN_D0]]] +// CHECK: %[[IN_J:.*]] = affine.apply #[[MAP2]](%[[J]])[%[[TILE_1]]] +// CHECK: %[[IN_J_SZ:.*]] = affine.min #[[MAP3]](%[[OUT_J_SZ]], %[[J]])[%[[TILE_1]], %[[IN_D1]]] +// CHECK: %[[SUB_IN:.*]] = tensor.extract_slice %[[IN]][%[[IN_I]], %[[IN_J]]] [%[[IN_I_SZ]], %[[IN_J_SZ]]] [1, 1] : tensor to tensor +// CHECK: %[[OUT_D2:.+]] = tensor.dim %[[ITER1]], %[[C2]] +// CHECK: %[[OUT_D3:.+]] = tensor.dim %[[ITER1]], %[[C3]] +// 
CHECK: %[[SUB_OUT:.*]] = tensor.extract_slice %[[ITER1]][%[[I]], %[[J]], 0, 0] [%[[OUT_I_SZ]], %[[OUT_J_SZ]], %[[OUT_D2]], %[[OUT_D3]]] [1, 1, 1, 1] : tensor to tensor +// CHECK: %[[PACK:.*]] = linalg.pack +// CHECK-SAME: %[[SUB_IN]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [%[[TILE_0]], %[[TILE_1]]] +// CHECK-SAME: into %[[SUB_OUT]] +// CHECK: %[[INSERT:.*]] = tensor.insert_slice %[[PACK]] into %[[ITER1]] +// CHECK: scf.yield %[[INSERT]] : tensor +// CHECK: } +// CHECK: scf.yield %[[RES1:.*]] : tensor +// CHECK: } +// CHECK: return %[[RES0:.*]] : tensor +// CHECK: } +func.func @pad_and_pack_fully_dynamic(%source: tensor, %dest: tensor, %pad: f32, %tile_n : index, %tile_m : index) -> tensor { + %0 = linalg.pack %source padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor + return %0 : tensor +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 floordiv 32)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 mod 32)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> ((d0 + 1) floordiv 32 - d0 floordiv 32 + 1)> +// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0) -> (d0 floordiv 16)> +// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0) -> (d0 mod 16)> +// CHECK-DAG: #[[MAP6:.+]] = affine_map<(d0) -> ((d0 + 3) floordiv 16 - d0 floordiv 16 + 1)> +// CHECK: func.func @NCnc_to_NC +// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: +// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index +// CHECK-DAG: %[[C256:.*]] = arith.constant 256 : index +// CHECK: %{{.+}} = scf.for %[[I:.+]] = %[[C0]] to %[[C256]] step %[[C2]] +// CHECK: %{{.+}} = scf.for %[[J:.+]] = %[[C0]] to %[[C128]] step %[[C4]] +// CHECK-DAG: %[[IN_I:.+]] = affine.apply #[[MAP0]](%[[I]]) +// CHECK-DAG: %[[OFFSET_I:.+]] = affine.apply #[[MAP1]](%[[I]]) +// CHECK-DAG: %[[IN_I_SZ:.+]] = affine.apply #[[MAP2]](%[[I]]) +// CHECK-DAG: %[[IN_J:.+]] = affine.apply #[[MAP4]](%[[J]]) +// CHECK-DAG: %[[OFFSET_J:.+]] = affine.apply #[[MAP5]](%[[J]]) +// CHECK-DAG: %[[IN_J_SZ:.+]] = affine.apply #[[MAP6]](%[[J]]) +// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[IN]] +// CHECK-SAME: [%[[IN_I]], %[[IN_J]], 0, 0] [%[[IN_I_SZ]], %[[IN_J_SZ]], 32, 16] +// CHECK-SAME: : tensor<8x8x32x16xf32> to tensor +// CHECK: %[[EMPTY:.+]] = tensor.empty +// CHECK: %[[UNPACK:.+]] = linalg.unpack +// CHECK-SAME: %[[SLICE]] inner_dims_pos = [0, 1] inner_tiles = [32, 16] +// CHECK-SAME: into %[[EMPTY]] +// CHECK: %[[UNPACK_SLICE:.+]] = tensor.extract_slice %[[UNPACK]] +// CHECK-SAME: [%[[OFFSET_I]], %[[OFFSET_J]]] [2, 4] +// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK_SLICE]] +// CHECK-SAME: into %{{.+}}[%[[I]], %[[J]]] [2, 4] +// CHECK: scf.yield %[[RES]] +func.func @NCnc_to_NC(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { + %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : 
tensor<8x8x32x16xf32> -> tensor<256x128xf32> + return %0 : tensor<256x128xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 floordiv 32)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 mod 32)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> ((d0 + 1) floordiv 32 - d0 floordiv 32 + 1)> +// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0) -> (d0 floordiv 8)> +// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0) -> (d0 mod 8)> +// CHECK-DAG: #[[MAP6:.+]] = affine_map<(d0) -> ((d0 + 3) floordiv 8 - d0 floordiv 8 + 1)> +// CHECK: func.func @CKkc_to_KC +// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: +// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index +// CHECK-DAG: %[[C256:.*]] = arith.constant 256 : index +// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[C128]] step %[[C2]] +// CHECK: %{{.+}} = scf.for %[[C:.+]] = %[[C0]] to %[[C256]] step %[[C4]] +// CHECK-DAG: %[[IN_K:.+]] = affine.apply #[[MAP0]](%[[K]]) +// CHECK-DAG: %[[OFFSET_K:.+]] = affine.apply #[[MAP1]](%[[K]]) +// CHECK-DAG: %[[IN_K_SZ:.+]] = affine.apply #[[MAP2]](%[[K]]) +// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP4]](%[[C]]) +// CHECK-DAG: %[[OFFSET_C:.+]] = affine.apply #[[MAP5]](%[[C]]) +// CHECK-DAG: %[[IN_C_SZ:.+]] = affine.apply #[[MAP6]](%[[C]]) +// CHECK: %[[IN_SLICE:.+]] = tensor.extract_slice %[[IN]] +// CHECK: [%[[IN_C]], %[[IN_K]], 0, 0] [%[[IN_C_SZ]], %[[IN_K_SZ]], 32, 8] +// CHECK: %[[EMPTY:.+]] = tensor.empty +// CHECK: %[[UNPACK:.+]] = linalg.unpack +// CHECK-SAME: %[[IN_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] +// CHECK-SAME: into %[[EMPTY]] +// CHECK: %[[UNPACK_SLICE:.+]] = tensor.extract_slice %[[UNPACK]] +// CHECK-SAME: [%[[OFFSET_K]], %[[OFFSET_C]]] [2, 4] +// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK_SLICE]] +// CHECK-SAME: into %{{.+}}[%[[K]], %[[C]]] [2, 4] +// CHECK: scf.yield %[[RES]] +func.func @CKkc_to_KC(%source: tensor<32x4x32x8xf32>, %dest: tensor<128x256xf32>) -> tensor<128x256xf32> { + %0 = linalg.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %dest : tensor<32x4x32x8xf32> -> tensor<128x256xf32> + return %0 : tensor<128x256xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 floordiv 2)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 floordiv 4)> +// CHECK: func.func @perfect_CKkc_to_KC +// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: +// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: +// CHECK-DAG: %[[C0:.*]] = 
arith.constant 0 : index
+// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
+// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index
+// CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index
+// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[C8]] step %[[C2]]
+// CHECK: %{{.+}} = scf.for %[[C:.+]] = %[[C0]] to %[[C128]] step %[[C4]]
+// CHECK-DAG: %[[IN_K:.+]] = affine.apply #[[MAP0]](%[[K]])
+// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP1]](%[[C]])
+// CHECK: %[[IN_SLICE:.+]] = tensor.extract_slice %[[IN]]
+// CHECK: [%[[IN_C]], %[[IN_K]], 0, 0] [1, 1, 2, 4]
+// CHECK: %[[ITER_SLICE:.+]] = tensor.extract_slice %{{.+}}[%[[K]], %[[C]]] [2, 4]
+// CHECK: %[[UNPACK:.+]] = linalg.unpack
+// CHECK-SAME: %[[IN_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 4]
+// CHECK-SAME: into %[[ITER_SLICE]]
+// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK]]
+// CHECK-SAME: into %{{.+}}[%[[K]], %[[C]]] [2, 4]
+// CHECK: scf.yield %[[RES]]
+func.func @perfect_CKkc_to_KC(%source: tensor<32x4x2x4xf32>, %dest: tensor<8x128xf32>) -> tensor<8x128xf32> {
+ %0 = linalg.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 4] into %dest : tensor<32x4x2x4xf32> -> tensor<8x128xf32>
+ return %0 : tensor<8x128xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
+// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)>
+// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 2)>
+// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 2)>
+// CHECK: func.func @dynamic_perfect_CKkc_to_KC
+// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]:
+// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]:
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
+// CHECK-DAG: %[[DIM_0:.+]] = tensor.dim %[[OUT]], %[[C0]]
+// CHECK-DAG: %[[DIM_1:.+]] = tensor.dim %[[OUT]], %[[C1]]
+// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[DIM_0]] step %[[C2]]
+// CHECK: %{{.+}} = scf.for %[[C:.+]] = %[[C0]] to %[[DIM_1]] step %[[C4]]
+// CHECK-DAG: %[[OUT_K_SZ:.+]] = affine.min #[[MAP0]](%[[K]])[%[[DIM_0]]]
+// CHECK-DAG: %[[OUT_C_SZ:.+]] = affine.min #[[MAP1]](%[[C]])[%[[DIM_1]]]
+// CHECK-DAG: %[[IN_K:.+]] = affine.apply #[[MAP2]](%[[K]])
+// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP2]](%[[C]])
+// CHECK-DAG: %[[IN_C_SZ:.+]] = affine.apply #[[MAP3]](%[[OUT_C_SZ]])
+// CHECK: %[[IN_SLICE:.+]] = tensor.extract_slice %[[IN]]
+// CHECK: [%[[IN_C]], %[[IN_K]], 0, 0] [%[[IN_C_SZ]], 1, 2, 2]
+// CHECK: %[[ITER_SLICE:.+]] = tensor.extract_slice %{{.+}}[%[[K]], %[[C]]] [%[[OUT_K_SZ]], %[[OUT_C_SZ]]]
+// CHECK: %[[UNPACK:.+]] = linalg.unpack
+// CHECK-SAME: %[[IN_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 2]
+// CHECK-SAME: into %[[ITER_SLICE]]
+// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK]]
+// CHECK-SAME: into %{{.+}}[%[[K]], %[[C]]] [%[[OUT_K_SZ]], %[[OUT_C_SZ]]]
+// CHECK: scf.yield %[[RES]]
+
+func.func 
@dynamic_perfect_CKkc_to_KC(%source: tensor, %dest: tensor) -> tensor { + %0 = linalg.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %dest : tensor -> tensor + return %0 : tensor +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK: #[[MAP:.+]] = affine_map<(d0) -> (d0 floordiv 2)> +// CHECK: func.func @perfect_NKPQk_to_NPQK( +// CHECK-SAME: %[[SOURCE:.+]]: tensor<1x4x6x6x2xf32>, +// CHECK-SAME: %{{.+}}: tensor<1x6x6x8xf32>) +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index +// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK: %{{.+}} = scf.for %[[P:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %{{.+}} = scf.for %[[Q:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[C8]] step %[[C4]] +// CHECK: %[[K_SZ:.+]] = affine.apply #[[MAP]](%[[K]]) +// CHECK: %[[SLICE_SOURCE:.+]] = tensor.extract_slice %[[SOURCE]][0, %[[K_SZ]], %[[P]], %[[Q]], 0] +// CHECK: %[[SLICE_DEST:.+]] = tensor.extract_slice %{{.+}}[0, %[[P]], %[[Q]], %[[K]]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack +// CHECK-SAME: %[[SLICE_SOURCE]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [2] +// CHECK-SAME: into %[[SLICE_DEST]] +// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK]] +// CHECK-SAME: into %{{.+}}[0, %[[P]], %[[Q]], %[[K]]] +// CHECK: scf.yield %[[RES]] + +func.func @perfect_NKPQk_to_NPQK(%source: tensor<1x4x6x6x2xf32>, %dest: tensor<1x6x6x8xf32>) -> tensor<1x6x6x8xf32> { + %0 = linalg.unpack %source outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [2] into %dest : tensor<1x4x6x6x2xf32> -> tensor<1x6x6x8xf32> + return %0 : tensor<1x6x6x8xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +func.func private @get_dynamic_tile_size() -> index + +// CHECK-LABEL: func.func @fully_dynamic_unpack +// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] +// CHECK-SAME: %[[DST:[0-9a-zA-Z]+]] +// CHECK: %[[INNER_TS:.+]] = call @get_dynamic_tile_size() : () -> index +// CHECK: %[[TD0:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC0:.*]] = %[[DST]]) +// CHECK: %[[TD1:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC1:.*]] = %[[TC0]]) +// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[SRC]] +// CHECK: %[[EMPTY:.+]] = tensor.empty +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[SLICE]] +// CHECK-SAME: inner_dims_pos = [1, 0] inner_tiles = [%[[INNER_TS]], %[[INNER_TS]]] into %[[EMPTY]] +func.func @fully_dynamic_unpack(%source: tensor, %dest: tensor) -> tensor { + %0 = func.call 
@get_dynamic_tile_size() : () -> index + %1 = linalg.unpack %source inner_dims_pos = [1, 0] inner_tiles = [%0, %0] into %dest : tensor -> tensor + return %1 : tensor +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [4, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 * 2)> +// CHECK: func.func @perfect_NPQK_to_NKPQk +// CHECK-SAME: %[[SOURCE:.+]]: tensor<1x6x6x8xf32>, +// CHECK-SAME: %{{.+}}: tensor<1x4x6x6x2xf32>) +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK: %{{.+}} = scf.for %[[ARG2:.+]] = %[[C0]] to %[[C4]] step %[[C1]] +// CHECK: %{{.+}} = scf.for %[[ARG4:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %{{.+}} = scf.for %[[ARG6:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %[[APPLY:.+]] = affine.apply #[[MAP1]](%[[ARG2]]) +// CHECK: %[[SLICE_SOURCE:.+]] = tensor.extract_slice %[[SOURCE]][0, %[[ARG4]], %[[ARG6]], %[[APPLY]]] +// CHECK: %[[SLICE_DEST:.+]] = tensor.extract_slice %{{.+}}[0, %[[ARG2]], %[[ARG4]], %[[ARG6]], 0] +// CHECK: %[[PACK:.+]] = linalg.pack +// CHECK-SAME: %[[SLICE_SOURCE]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [2] +// CHECK-SAME: into %[[SLICE_DEST]] +// CHECK: %[[RES:.+]] = tensor.insert_slice %[[PACK]] +// CHECK-SAME: into %{{.+}}[0, %[[ARG2]], %[[ARG4]], %[[ARG6]], 0] +// CHECK: scf.yield %[[RES]] + +func.func @perfect_NPQK_to_NKPQk(%source: tensor<1x6x6x8xf32>, %dest: tensor<1x4x6x6x2xf32>) -> tensor<1x4x6x6x2xf32> { + %0 = linalg.pack %source outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [2] into %dest : tensor<1x6x6x8xf32> -> tensor<1x4x6x6x2xf32> + return %0 : tensor<1x4x6x6x2xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} diff --git a/mlir/test/Dialect/Linalg/transform-pack-greedily.mlir b/mlir/test/Dialect/Linalg/transform-pack-greedily.mlir index 100692426ef44..5812c4db88247 100644 --- a/mlir/test/Dialect/Linalg/transform-pack-greedily.mlir +++ b/mlir/test/Dialect/Linalg/transform-pack-greedily.mlir @@ -378,11 +378,11 @@ func.func @no_padding_on_packs(%A: tensor<32x32xf32>, %B: tensor<32x32xf32>, %C: } // CHECK-LABEL: no_padding_on_packs -// CHECK: tensor.pack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [8, 4] +// CHECK: linalg.pack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [8, 4] // CHECK-SAME: into %{{.+}} : tensor<32x32xf32> -> tensor<4x8x8x4xf32> -// CHECK: tensor.pack %{{.+}} outer_dims_perm = [1, 0] +// CHECK: linalg.pack %{{.+}} outer_dims_perm = [1, 0] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %{{.+}} : tensor<32x32xf32> -> tensor<2x8x4x16xf32> -// CHECK: 
tensor.pack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [8, 16] +// CHECK: linalg.pack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [8, 16] // CHECK-SAME: into %{{.+}} : tensor<32x32xf32> -> tensor<4x2x8x16xf32> module attributes {transform.with_named_sequence} { @@ -393,12 +393,12 @@ module attributes {transform.with_named_sequence} { matmul_packed_sizes = [8, 16, 4] matmul_inner_dims_order = [0, 1, 2] : (!transform.op<"linalg.matmul">) -> !transform.op<"linalg.generic"> %pack = transform.get_producer_of_operand %1[1] - : (!transform.op<"linalg.generic">) -> (!transform.op<"tensor.pack">) + : (!transform.op<"linalg.generic">) -> (!transform.op<"linalg.pack">) %2, %pack_2, %empty_unpack_2 = transform.structured.pack_transpose %pack with_compute_op(%1) outer_perm = [1, 0] inner_perm = [1, 0] - : (!transform.op<"tensor.pack">, !transform.op<"linalg.generic">) - -> (!transform.op<"linalg.generic">, !transform.op<"tensor.pack">, !transform.any_op) + : (!transform.op<"linalg.pack">, !transform.op<"linalg.generic">) + -> (!transform.op<"linalg.generic">, !transform.op<"linalg.pack">, !transform.any_op) transform.yield } } diff --git a/mlir/test/Dialect/Linalg/transform-tile-and-fuse-pack-unpack.mlir b/mlir/test/Dialect/Linalg/transform-tile-and-fuse-pack-unpack.mlir index faf7ff9ad7ed0..5d4ae4f15d3fd 100644 --- a/mlir/test/Dialect/Linalg/transform-tile-and-fuse-pack-unpack.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-and-fuse-pack-unpack.mlir @@ -14,7 +14,7 @@ module { func.func @fuse_pack_as_producer(%src: tensor<128x256xf32>, %other: tensor<4x4x128x256xf32>) -> tensor<4x4x128x256xf32> { %dest = tensor.empty() : tensor<1x1x128x256xf32> - %pack = tensor.pack %src inner_dims_pos = [0, 1] inner_tiles = [128, 256] + %pack = linalg.pack %src inner_dims_pos = [0, 1] inner_tiles = [128, 256] into %dest : tensor<128x256xf32> -> tensor<1x1x128x256xf32> %out = tensor.empty() : tensor<4x4x128x256xf32> @@ -36,10 +36,10 @@ module { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { // Find and lower pack operation. - %pack = transform.structured.match ops{["tensor.pack"]} in %arg1 - : (!transform.any_op) -> !transform.op<"tensor.pack"> + %pack = transform.structured.match ops{["linalg.pack"]} in %arg1 + : (!transform.any_op) -> !transform.op<"linalg.pack"> %paded, %expanded, %transpose = transform.structured.lower_pack %pack {lowerPadLikeWithInsertSlice = false} - : (!transform.op<"tensor.pack">) + : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) @@ -72,7 +72,7 @@ module { func.func @fuse_pack_as_producer_blocked_by_insert_slice(%src: tensor<128x256xf32>, %other: tensor<4x4x128x256xf32>) -> tensor<4x4x128x256xf32> { %dest = tensor.empty() : tensor<1x1x128x256xf32> - %pack = tensor.pack %src inner_dims_pos = [0, 1] inner_tiles = [128, 256] + %pack = linalg.pack %src inner_dims_pos = [0, 1] inner_tiles = [128, 256] into %dest : tensor<128x256xf32> -> tensor<1x1x128x256xf32> %out = tensor.empty() : tensor<4x4x128x256xf32> @@ -94,10 +94,10 @@ module { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { // Find and lower pack operation. 
- %pack = transform.structured.match ops{["tensor.pack"]} in %arg1 - : (!transform.any_op) -> !transform.op<"tensor.pack"> + %pack = transform.structured.match ops{["linalg.pack"]} in %arg1 + : (!transform.any_op) -> !transform.op<"linalg.pack"> %paded, %expanded, %transpose = transform.structured.lower_pack %pack - : (!transform.op<"tensor.pack">) + : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) @@ -143,7 +143,7 @@ module { } -> tensor<1x1x128x256xf32> %dest = tensor.empty() : tensor<128x256xf32> - %unpack = tensor.unpack %res inner_dims_pos = [0, 1] inner_tiles = [128, 256] + %unpack = linalg.unpack %res inner_dims_pos = [0, 1] inner_tiles = [128, 256] into %dest : tensor<1x1x128x256xf32> -> tensor<128x256xf32> return %unpack : tensor<128x256xf32> @@ -152,10 +152,10 @@ module { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { // Find and lower unpack operation. - %unpack = transform.structured.match ops{["tensor.unpack"]} in %arg1 - : (!transform.any_op) -> !transform.op<"tensor.unpack"> + %unpack = transform.structured.match ops{["linalg.unpack"]} in %arg1 + : (!transform.any_op) -> !transform.op<"linalg.unpack"> transform.structured.lower_unpack %unpack {lowerUnpadLikeWithExtractSlice = false} - : (!transform.op<"tensor.unpack">) + : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, @@ -204,7 +204,7 @@ module { } -> tensor<1x1x128x256xf32> %dest = tensor.empty() : tensor<128x256xf32> - %unpack = tensor.unpack %res inner_dims_pos = [0, 1] inner_tiles = [128, 256] + %unpack = linalg.unpack %res inner_dims_pos = [0, 1] inner_tiles = [128, 256] into %dest : tensor<1x1x128x256xf32> -> tensor<128x256xf32> return %unpack : tensor<128x256xf32> @@ -213,10 +213,10 @@ module { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { // Find and lower unpack operation. 
- %unpack = transform.structured.match ops{["tensor.unpack"]} in %arg1 - : (!transform.any_op) -> !transform.op<"tensor.unpack"> + %unpack = transform.structured.match ops{["linalg.unpack"]} in %arg1 + : (!transform.any_op) -> !transform.op<"linalg.unpack"> transform.structured.lower_unpack %unpack - : (!transform.op<"tensor.unpack">) + : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, diff --git a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir index 8fbc74ec345c6..8f3b199145ce0 100644 --- a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir +++ b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir @@ -115,13 +115,13 @@ module attributes {transform.with_named_sequence} { func.func @test_pack_no_vectorize_dynamic_shape(%arg0: tensor, %arg1: tensor<4x16xf32>) -> tensor<4x16xf32> { %pad = arith.constant 0.000000e+00 : f32 // expected-error @+1 {{Attempted to vectorize, but failed}} - %pack = tensor.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [0] inner_tiles = [16] into %arg1 : tensor -> tensor<4x16xf32> + %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [0] inner_tiles = [16] into %arg1 : tensor -> tensor<4x16xf32> return %pack : tensor<4x16xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 : !transform.any_op transform.yield } diff --git a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir index 5ae3f893c2e73..9f2ee47b45b3e 100644 --- a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir +++ b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir @@ -1944,13 +1944,13 @@ module attributes {transform.with_named_sequence} { // masking was used. 
func.func @test_vectorize_pack(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> { - %pack = tensor.pack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32> + %pack = linalg.pack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32> return %pack : tensor<4x1x32x16x2xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op transform.yield @@ -1977,7 +1977,7 @@ module attributes {transform.with_named_sequence} { func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { %pad = arith.constant 0.000000e+00 : f32 - %pack = tensor.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> + %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> return %pack : tensor<32x4x1x16x2xf32> } @@ -1995,7 +1995,7 @@ func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor< module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op transform.yield diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir index 6d39262945de5..c6d9ec6215715 100644 --- a/mlir/test/Dialect/Linalg/vectorization.mlir +++ b/mlir/test/Dialect/Linalg/vectorization.mlir @@ -671,7 +671,7 @@ module attributes {transform.with_named_sequence} { // masking was used. 
func.func @test_vectorize_pack(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> { - %pack = tensor.pack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32> + %pack = linalg.pack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32> return %pack : tensor<4x1x32x16x2xf32> } // CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32 @@ -688,7 +688,7 @@ func.func @test_vectorize_pack(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [4, 1, 32] : !transform.any_op transform.yield } @@ -702,7 +702,7 @@ module attributes {transform.with_named_sequence} { func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { %pad = arith.constant 0.000000e+00 : f32 - %pack = tensor.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> + %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> return %pack : tensor<32x4x1x16x2xf32> } // CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32 @@ -725,7 +725,7 @@ func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor< module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [32, 4, 1] : !transform.any_op transform.yield } @@ -734,7 +734,7 @@ module attributes {transform.with_named_sequence} { // ----- func.func @test_vectorize_dynamic_pack(%arg0: tensor, %arg1: tensor) -> tensor { - %pack = tensor.pack %arg0 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg1 : tensor -> tensor + %pack = linalg.pack %arg0 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg1 : tensor -> tensor return %pack : tensor } // CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32 @@ -766,7 +766,7 @@ func.func @test_vectorize_dynamic_pack(%arg0: tensor, %arg1: tensor !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [4, 1] : !transform.any_op transform.yield } @@ -893,12 +893,12 @@ func.func @test_vectorize_dynamic_shapes_unpack(%arg0: tensor, %arg1: t // CHECK: %[[writeMsk0:.*]] = vector.create_mask {{.*}} : vector<4x16xi1> // CHECK: %[[write0:.*]] = vector.mask %[[writeMsk0:.*]] {{.*}} vector.transfer_write %[[sc0]], %[[empt0]] // CHECK: return %[[write0]] - %ret = tensor.unpack %arg1 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg0 : tensor -> tensor + %ret = linalg.unpack %arg1 inner_dims_pos = [1, 0] inner_tiles 
= [16, 2] into %arg0 : tensor -> tensor return %ret : tensor } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [4, 16] : !transform.any_op transform.yield } @@ -925,12 +925,12 @@ func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<2 // CHECK: %[[WRITEMSK:.*]] = vector.create_mask %[[C256]], %[[C128]] : vector<512x128xi1> // CHECK: %[[WRIT:.*]] = vector.mask %[[WRITEMSK]] {{.*}} : vector<512x128xi1> -> tensor<256x128xf32> // CHECK: return %[[WRIT]] : tensor<256x128xf32> - %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> + %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> return %0 : tensor<256x128xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [512, 128] : !transform.any_op transform.yield } @@ -949,12 +949,12 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: // CHECK: %[[C00:.*]] = arith.constant 0 : index // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32> // CHECK: return %[[WRIT]] : tensor<256x128xf32> - %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> + %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> return %0 : tensor<256x128xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op transform.yield } @@ -973,12 +973,12 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: // CHECK: %[[C00:.*]] = arith.constant 0 : index // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32> // CHECK: return %[[WRIT]] : tensor<256x128xf32> - %0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> + %0 = linalg.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> return %0 : tensor<256x128xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} 
in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op transform.yield } @@ -988,7 +988,7 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: // CHECK-LABEL: test_vectorize_pack_no_vector_sizes func.func @test_vectorize_pack_no_vector_sizes(%arg0: tensor<64x4xf32>, %arg1: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> { - %pack = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %arg1 : tensor<64x4xf32> -> tensor<2x4x16x2xf32> + %pack = linalg.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %arg1 : tensor<64x4xf32> -> tensor<2x4x16x2xf32> return %pack : tensor<2x4x16x2xf32> } // CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32 @@ -1005,7 +1005,7 @@ func.func @test_vectorize_pack_no_vector_sizes(%arg0: tensor<64x4xf32>, %arg1: t module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 : !transform.any_op transform.yield } @@ -1016,7 +1016,7 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: test_vectorize_padded_pack_no_vector_sizes func.func @test_vectorize_padded_pack_no_vector_sizes(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { %pad = arith.constant 0.000000e+00 : f32 - %pack = tensor.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> + %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> return %pack : tensor<32x4x1x16x2xf32> } // CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32 @@ -1033,7 +1033,7 @@ func.func @test_vectorize_padded_pack_no_vector_sizes(%arg0: tensor<32x7x15xf32> module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 : !transform.any_op transform.yield } @@ -1051,12 +1051,12 @@ func.func @test_vectorize_unpack_no_vector_sizes(%source: tensor<8x8x32x16xf32>, // CHECK: %[[C00:.*]] = arith.constant 0 : index // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32> // CHECK: return %[[WRIT]] : tensor<256x128xf32> - %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> + %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> return %0 : tensor<256x128xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match 
ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 : !transform.any_op transform.yield } @@ -1075,12 +1075,12 @@ func.func @test_vectorize_unpack_no_vector_sizes_slice_output(%source: tensor<8x // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], %[[EMPT]]{{\[}}%[[C00]], %[[C00]]] // CHECK-SAME: {in_bounds = [true, false]} : vector<64x128xf32>, tensor<64x127xf32> // CHECK: return %[[WRIT]] : tensor<64x127xf32> - %0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %dest : tensor<8x4x16x16xf32> -> tensor<64x127xf32> + %0 = linalg.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %dest : tensor<8x4x16x16xf32> -> tensor<64x127xf32> return %0 : tensor<64x127xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 : !transform.any_op transform.yield } @@ -1089,7 +1089,7 @@ func.func @test_vectorize_unpack_no_vector_sizes_slice_output(%source: tensor<8x // ----- func.func @test_vectorize_unpack_no_vector_sizes_permute(%source: tensor<4x7x4xf32>, %dest: tensor<7x16xf32>) -> tensor<7x16xf32> { - %0 = tensor.unpack %source outer_dims_perm=[1, 0] inner_dims_pos = [1] inner_tiles = [4] into %dest : tensor<4x7x4xf32> -> tensor<7x16xf32> + %0 = linalg.unpack %source outer_dims_perm=[1, 0] inner_dims_pos = [1] inner_tiles = [4] into %dest : tensor<4x7x4xf32> -> tensor<7x16xf32> return %0 : tensor<7x16xf32> } // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 @@ -1103,7 +1103,7 @@ func.func @test_vectorize_unpack_no_vector_sizes_permute(%source: tensor<4x7x4xf // CHECK: return %[[WRIT]] : tensor<7x16xf32> module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 : !transform.any_op transform.yield } diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index 01d14871072cd..90cc0ca658ffb 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -899,225 +899,6 @@ func.func @fold_extract_constant_splat() -> (tensor<4x4xi32>) { // ----- -// CHECK-LABEL: func @fold_pack_constant_splat -// CHECK-NOT: tensor.pack -// CHECK: arith.constant dense<1.000000e-01> : tensor<8x16x8x32xf32> -func.func @fold_pack_constant_splat(%dest : tensor<8x16x8x32xf32>) -> tensor<8x16x8x32xf32> { - %cst = arith.constant dense<1.000000e-01> : tensor<64x128xf32> - %0 = tensor.pack %cst outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] - inner_tiles = [8, 32] into %dest : tensor<64x128xf32> -> tensor<8x16x8x32xf32> - return %0 : tensor<8x16x8x32xf32> -} - -// ----- - -// CHECK-LABEL: func @fold_padding_value_pack_constant_splat -// CHECK-NOT: tensor.pack -// CHECK: arith.constant dense<1.000000e-01> : 
tensor<8x16x8x32xf32> -func.func @fold_padding_value_pack_constant_splat(%dest : tensor<8x16x8x32xf32>) -> tensor<8x16x8x32xf32> { - %pad = arith.constant 1.000000e-01 : f32 - %cst = arith.constant dense<1.000000e-01> : tensor<63x127xf32> - %0 = tensor.pack %cst - padding_value(%pad : f32) - outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] - inner_tiles = [8, 32] into %dest : tensor<63x127xf32> -> tensor<8x16x8x32xf32> - return %0 : tensor<8x16x8x32xf32> -} - - -// ----- - -// CHECK-LABEL: func @nofold_padding_value_pack_constant_splat -// CHECK: arith.constant dense<1.000000e-01> : tensor<63x127xf32> -// CHECK: tensor.pack -func.func @nofold_padding_value_pack_constant_splat(%dest : tensor<8x16x8x32xf32>) -> tensor<8x16x8x32xf32> { - %pad = arith.constant 0.0 : f32 - %cst = arith.constant dense<1.000000e-01> : tensor<63x127xf32> - %0 = tensor.pack %cst - padding_value(%pad : f32) - outer_dims_perm = [1, 0] - inner_dims_pos = [0, 1] - inner_tiles = [8, 32] - into %dest : tensor<63x127xf32> -> tensor<8x16x8x32xf32> - return %0 : tensor<8x16x8x32xf32> -} - -// ----- - -func.func @fold_padding_value_pack(%arg0: tensor<1200x500000xf32>) -> tensor<31250x1200x16x1xf32> { - %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.empty() : tensor<31250x1200x16x1xf32> - %pack = tensor.pack %arg0 - padding_value(%cst : f32) - outer_dims_perm = [1, 0] - inner_dims_pos = [1, 0] - inner_tiles = [16, 1] - into %0 : tensor<1200x500000xf32> -> tensor<31250x1200x16x1xf32> - return %pack : tensor<31250x1200x16x1xf32> -} -// CHECK-LABEL: func @fold_padding_value_pack -// CHECK-NOT: padding_value - -// ----- - -func.func @infer_src_shape_pack(%src: tensor, %dest: tensor<10x20x30x40x16xf32>) -> tensor<10x20x30x40x16xf32> { - %cst = arith.constant 0.000000e+00 : f32 - %pack = tensor.pack %src - padding_value(%cst : f32) - outer_dims_perm = [2, 1, 3, 0] - inner_dims_pos = [2] - inner_tiles = [16] - into %dest : tensor -> tensor<10x20x30x40x16xf32> - return %pack : tensor<10x20x30x40x16xf32> -} -// CHECK-LABEL: func.func @infer_src_shape_pack -// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] -// CHECK-SAME: %[[DEST:[0-9a-zA-Z]+]] -// CHECK: %[[CAST_SRC:.+]] = tensor.cast %[[SRC]] : tensor to tensor<40x20x?x30xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[CAST_SRC]] {{.+}} into %[[DEST]] -// CHECK: return %[[PACK]] - -// ----- - -func.func @infer_dest_shape_pack(%src: tensor<30x20x?x10xf32>, %dest: tensor) -> tensor { - %cst = arith.constant 0.000000e+00 : f32 - %pack = tensor.pack %src - padding_value(%cst : f32) - outer_dims_perm = [2, 1, 3, 0] - inner_dims_pos = [2] - inner_tiles = [16] - into %dest : tensor<30x20x?x10xf32> -> tensor - return %pack : tensor -} -// CHECK-LABEL: func.func @infer_dest_shape_pack -// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] -// CHECK-SAME: %[[DEST:[0-9a-zA-Z]+]] -// CHECK: %[[CAST_DEST:.+]] = tensor.cast %[[DEST]] : tensor to tensor -// CHECK: %[[PACK:.+]] = tensor.pack %[[SRC]] {{.+}} into %[[CAST_DEST]] -// CHECK: %[[CAST_PACK:.+]] = tensor.cast %[[PACK]] : tensor to tensor -// CHECK: return %[[CAST_PACK]] - -// ----- - -func.func @no_infer_pack_shape(%arg0: tensor, %arg1: index) -> tensor<32x7x?x16x1xf32> { - %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.empty(%arg1) : tensor<32x7x?x16x1xf32> - %pack = tensor.pack %arg0 padding_value(%cst : f32) outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %0 : tensor -> tensor<32x7x?x16x1xf32> - return %pack : tensor<32x7x?x16x1xf32> -} -// CHECK-LABEL: func.func @no_infer_pack_shape -// CHECK-NOT: tensor.cast - -// ----- 
- -func.func @fold_padding_value_pack_negative1(%arg0: tensor<1200x499999xf32>) -> tensor<31250x1200x16x1xf32> { - %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.empty() : tensor<31250x1200x16x1xf32> - %pack = tensor.pack %arg0 - padding_value(%cst : f32) - outer_dims_perm = [1, 0] - inner_dims_pos = [1, 0] - inner_tiles = [16, 1] - into %0 : tensor<1200x499999xf32> -> tensor<31250x1200x16x1xf32> - return %pack : tensor<31250x1200x16x1xf32> -} -// CHECK-LABEL: func @fold_padding_value_pack_negative1 -// CHECK: tensor.pack -// CHECK-SAME: padding_value - -// ----- - -func.func @fold_padding_value_pack_negative2(%arg0: tensor<1200x?xf32>, %arg1: tensor) -> tensor { - %cst = arith.constant 0.000000e+00 : f32 - %pack = tensor.pack %arg0 - padding_value(%cst : f32) - outer_dims_perm = [1, 0] - inner_dims_pos = [1, 0] - inner_tiles = [16, 1] - into %arg1 : tensor<1200x?xf32> -> tensor - return %pack : tensor -} -// CHECK-LABEL: func @fold_padding_value_pack_negative2 -// CHECK: tensor.pack -// CHECK-SAME: padding_value - -// ----- - -func.func @fold_padding_value_pack_negative3(%arg0: tensor<1200x500000xf32>, %arg1: tensor, %tile : index) -> tensor { - %cst = arith.constant 0.000000e+00 : f32 - %pack = tensor.pack %arg0 - padding_value(%cst : f32) - outer_dims_perm = [1, 0] - inner_dims_pos = [1, 0] - inner_tiles = [%tile, 1] - into %arg1 : tensor<1200x500000xf32> -> tensor - return %pack : tensor -} -// CHECK-LABEL: func @fold_padding_value_pack_negative3 -// CHECK: tensor.pack -// CHECK-SAME: padding_value - -// ----- - -// CHECK-LABEL: func @fold_unpack_constant_splat -// CHECK-NOT: tensor.unpack -// CHECK: arith.constant dense<1.000000e-01> : tensor<128x256xf32> -func.func @fold_unpack_constant_splat(%dest : tensor<128x256xf32>) -> tensor<128x256xf32> { - %cst = arith.constant dense<1.000000e-01> : tensor<16x8x8x32xf32> - %0 = tensor.unpack %cst inner_dims_pos = [0, 1] - inner_tiles = [8, 32] into %dest : tensor<16x8x8x32xf32> -> tensor<128x256xf32> - return %0 : tensor<128x256xf32> -} - -// ----- - -func.func @infer_dest_shape_unpack(%src: tensor<10x20x30x40x16xf32>, %dest: tensor) -> tensor { - %unpack = tensor.unpack %src - outer_dims_perm = [2, 1, 3, 0] - inner_dims_pos = [2] - inner_tiles = [16] - into %dest : tensor<10x20x30x40x16xf32> -> tensor - return %unpack : tensor -} -// CHECK-LABEL: func.func @infer_dest_shape_unpack -// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] -// CHECK-SAME: %[[DEST:[0-9a-zA-Z]+]] -// CHECK: %[[CAST_DEST:.+]] = tensor.cast %[[DEST]] : tensor to tensor<40x20x?x30xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[SRC]] {{.+}} into %[[CAST_DEST]] -// CHECK: %[[CAST_UNPACK:.+]] = tensor.cast %[[UNPACK]] : tensor<40x20x?x30xf32> to tensor -// CHECK: return %[[CAST_UNPACK]] - -// ----- - -func.func @infer_src_shape_unpack(%src: tensor, %dest: tensor<30x20x?x10xf32>) -> tensor<30x20x?x10xf32> { - %unpack = tensor.unpack %src - outer_dims_perm = [2, 1, 3, 0] - inner_dims_pos = [2] - inner_tiles = [16] - into %dest : tensor -> tensor<30x20x?x10xf32> - return %unpack : tensor<30x20x?x10xf32> -} -// CHECK-LABEL: func.func @infer_src_shape_unpack -// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] -// CHECK-SAME: %[[DEST:[0-9a-zA-Z]+]] -// CHECK: %[[CAST_SRC:.+]] = tensor.cast %[[SRC]] : tensor to tensor -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[CAST_SRC]] -// CHECK: return %[[UNPACK]] - -// ----- - -func.func @no_infer_unpack_shape(%arg1: tensor<32x7x?x16x1xf32>, %arg2: index) -> tensor { - %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.empty(%arg2) : tensor - %unpack = 
tensor.unpack %arg1 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %0 : tensor<32x7x?x16x1xf32> -> tensor - return %unpack : tensor -} -// CHECK-LABEL: func.func @no_infer_unpack_shape -// CHECK-NOT: tensor.cast - -// ----- - - // CHECK-LABEL: func @fold_overlapping_insert // CHECK-SAME: %[[INPUT:.+]]: tensor, %{{.+}}: tensor<4x?x8xf32>, %[[SLICE2:.+]]: tensor<4x?x8xf32> func.func @fold_overlapping_insert(%input : tensor, %slice1: tensor<4x?x8xf32>, %slice2: tensor<4x?x8xf32>, %i: index, %size: index) -> (tensor) { @@ -2370,174 +2151,6 @@ func.func @collapse_expand_fold_to_cast(%t: tensor, %sz0: index) -> (tens // ----- -// Chain: NC -> NCnc -> NCnc -> NC -// CHECK: func.func @unpack_pack( -// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>) -// CHECK: return %[[T]] : tensor<128x128xf32> -func.func @unpack_pack(%t: tensor<128x128xf32>) -> tensor<128x128xf32> { - %tensor_empty = tensor.empty() : tensor<16x16x8x8xf32> - %packed = tensor.pack %t inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty : tensor<128x128xf32> -> tensor<16x16x8x8xf32> - %tensor_empty1 = tensor.empty() : tensor<128x128xf32> - %unpacked = tensor.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty1 : tensor<16x16x8x8xf32> -> tensor<128x128xf32> - return %unpacked : tensor<128x128xf32> -} - -// ----- - -// Chain: NC -> NCcn -> NCnc -> NC -// CHECK: func.func @unpack_pack( -// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>) -// CHECK-NOT: return %[[T]] : tensor<128x128xf32> -func.func @unpack_pack(%t: tensor<128x128xf32>) -> tensor<128x128xf32> { - %tensor_empty = tensor.empty() : tensor<16x16x8x8xf32> - %packed = tensor.pack %t inner_dims_pos = [1, 0] inner_tiles = [8, 8] into %tensor_empty : tensor<128x128xf32> -> tensor<16x16x8x8xf32> - %tensor_empty1 = tensor.empty() : tensor<128x128xf32> - %unpacked = tensor.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty1 : tensor<16x16x8x8xf32> -> tensor<128x128xf32> - return %unpacked : tensor<128x128xf32> -} - -// ----- - -// Chain: NC -> CNcn -> NCnc -> NC -// CHECK: func.func @unpack_pack( -// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>) -// CHECK-NOT: return %[[T]] : tensor<128x128xf32> -func.func @unpack_pack(%t: tensor<128x128xf32>) -> tensor<128x128xf32> { - %tensor_empty = tensor.empty() : tensor<16x16x8x8xf32> - %packed = tensor.pack %t outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [8, 8] into %tensor_empty : tensor<128x128xf32> -> tensor<16x16x8x8xf32> - %tensor_empty1 = tensor.empty() : tensor<128x128xf32> - %unpacked = tensor.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty1 : tensor<16x16x8x8xf32> -> tensor<128x128xf32> - return %unpacked : tensor<128x128xf32> -} - -// ----- - -// Chain: NC -> NCnc -> NCnc -> NC -// CHECK: func.func @unpack_pack( -// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>, -// CHECK: return %[[T]] : tensor<128x128xf32> -func.func @unpack_pack(%t: tensor<128x128xf32>, %tile1: index, %tile2: index) -> tensor<128x128xf32> { - %tensor_empty = tensor.empty(%tile1, %tile2) : tensor<16x16x?x?xf32> - %packed = tensor.pack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor<128x128xf32> -> tensor<16x16x?x?xf32> - %tensor_empty1 = tensor.empty() : tensor<128x128xf32> - %unpacked = tensor.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor<16x16x?x?xf32> -> tensor<128x128xf32> - return %unpacked : tensor<128x128xf32> -} - -// ----- - -// 
CHECK: func.func @unpack_pack_with_padding_no_canonicalization( -// CHECK: tensor.pack -// CHECK: tensor.unpack -func.func @unpack_pack_with_padding_no_canonicalization(%t: tensor<256x512xbf16>) -> tensor<224x512xbf16> { - %tensor_empty = tensor.empty() : tensor<4x16x64x32xbf16> - %tensor_empty1 = tensor.empty() : tensor<224x512xbf16> - %packed = tensor.pack %t outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %tensor_empty : tensor<256x512xbf16> -> tensor<4x16x64x32xbf16> - %unpacked = tensor.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %tensor_empty1 : tensor<4x16x64x32xbf16> -> tensor<224x512xbf16> - return %unpacked : tensor<224x512xbf16> -} - -// ----- - -// Chain NCnc -> NC -> NC -> NCnc -// CHECK: func.func @pack_unpack( -// CHECK-SAME: %[[T:.+]]: tensor<16x16x?x?xf32>, -// CHECK: return %[[T]] : tensor<16x16x?x?xf32> -func.func @pack_unpack(%t: tensor<16x16x?x?xf32>, %tile1: index, %tile2: index) -> tensor<16x16x?x?xf32> { - %tensor_empty = tensor.empty() : tensor<128x128xf32> - %unpacked = tensor.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor<16x16x?x?xf32> -> tensor<128x128xf32> - %tensor_empty1 = tensor.empty(%tile1, %tile2) : tensor<16x16x?x?xf32> - %packed = tensor.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor<128x128xf32> -> tensor<16x16x?x?xf32> - return %packed : tensor<16x16x?x?xf32> -} - -// ----- - -// Chain NCnc -> NC -> NC -> NCnc -// CHECK: func.func @pack_unpack( -// CHECK-SAME: %[[T:.+]]: tensor<16x16x8x8xf32> -// CHECK: return %[[T]] : tensor<16x16x8x8xf32> -func.func @pack_unpack(%t: tensor<16x16x8x8xf32>) -> tensor<16x16x8x8xf32> { - %tensor_empty = tensor.empty() : tensor<128x128xf32> - %unpacked = tensor.unpack %t inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty : tensor<16x16x8x8xf32> -> tensor<128x128xf32> - %tensor_empty1 = tensor.empty() : tensor<16x16x8x8xf32> - %packed = tensor.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty1 : tensor<128x128xf32> -> tensor<16x16x8x8xf32> - return %packed : tensor<16x16x8x8xf32> -} - -// ----- - -// CHECK: func.func @pack_unpack_same_tiles( -// CHECK-SAME: %[[T:.+]]: tensor, -// CHECK: return %[[T]] : tensor -func.func @pack_unpack_same_tiles(%t: tensor, %dim1: index, %dim2: index, %dim3: index, %dim4: index, %dim5: index, %dim6: index, - %tile1: index, %tile2: index) -> tensor { - %tensor_empty = tensor.empty(%dim1, %dim2) : tensor - %unpacked = tensor.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor -> tensor - %tensor_empty1 = tensor.empty(%dim3, %dim4, %dim5, %dim6) : tensor - %packed = tensor.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor -> tensor - return %packed : tensor -} - -// ----- - -// CHECK: func.func @pack_unpack_different_tiles( -// CHECK-SAME: %[[T:.+]]: tensor, -// CHECK-NOT: return %[[T]] : tensor -func.func @pack_unpack_different_tiles(%t: tensor, %dim1: index, %dim2: index, %dim3: index, %dim4: index, %dim5: index, %dim6: index, - %tile1: index, %tile2: index) -> tensor { - %tensor_empty = tensor.empty(%dim1, %dim2) : tensor - %unpacked = tensor.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor -> tensor - %tensor_empty1 = tensor.empty(%dim3, %dim4, %dim5, %dim6) : tensor - %packed = tensor.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [%tile2, %tile1] into 
%tensor_empty1 : tensor -> tensor - return %packed : tensor -} - -// ----- - -// CHECK: func.func @pack_unpack_dynamic_with_padding( -// CHECK-SAME: %[[T:.+]]: tensor, -// CHECK-NOT: return %[[T]] : tensor -func.func @pack_unpack_dynamic_with_padding(%t: tensor, %dim1: index, %dim2: index, %dim3: index, %dim4: index, %dim5: index, %dim6: index, - %tile1: index, %tile2: index, %pad: f32) -> tensor { - %tensor_empty = tensor.empty(%dim1, %dim2) : tensor - %unpacked = tensor.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor -> tensor - %tensor_empty1 = tensor.empty(%dim3, %dim4, %dim5, %dim6) : tensor - %packed = tensor.pack %unpacked padding_value(%pad: f32) inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor -> tensor - return %packed : tensor -} - -// ----- - -// CHECK: func.func @pack_outer_dims_unpack_no_outer_dims( -// CHECK-SAME: %[[T:.+]]: tensor<16x16x?x?xf32>, -// CHECK: return %[[T]] : tensor<16x16x?x?xf32> -func.func @pack_outer_dims_unpack_no_outer_dims(%t: tensor<16x16x?x?xf32>, %tile1: index, %tile2: index) -> tensor<16x16x?x?xf32> { - %tensor_empty = tensor.empty() : tensor<128x128xf32> - %unpacked = tensor.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor<16x16x?x?xf32> -> tensor<128x128xf32> - %tensor_empty1 = tensor.empty(%tile1, %tile2) : tensor<16x16x?x?xf32> - %packed = tensor.pack %unpacked outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor<128x128xf32> -> tensor<16x16x?x?xf32> - return %packed : tensor<16x16x?x?xf32> -} - -// ----- - -// CHECK: func.func @pack_no_outer_dims_unpack_outer_dims( -// CHECK-SAME: %[[T:.+]]: tensor<16x16x?x?xf32>, -// CHECK: return %[[T]] : tensor<16x16x?x?xf32> -func.func @pack_no_outer_dims_unpack_outer_dims(%t: tensor<16x16x?x?xf32>, %tile1: index, %tile2: index) -> tensor<16x16x?x?xf32> { - %tensor_empty = tensor.empty() : tensor<128x128xf32> - %unpacked = tensor.unpack %t outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor<16x16x?x?xf32> -> tensor<128x128xf32> - %tensor_empty1 = tensor.empty(%tile1, %tile2) : tensor<16x16x?x?xf32> - %packed = tensor.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor<128x128xf32> -> tensor<16x16x?x?xf32> - return %packed : tensor<16x16x?x?xf32> -} - -// ----- - // CHECK: func.func @invalid_empty_negative_size // CHECK: %[[IDX:.*]] = index.constant // CHECK: %[[T:.*]] = tensor.empty(%[[IDX]]) : tensor<4x5x?xf32> @@ -2551,22 +2164,6 @@ func.func @invalid_empty_negative_size() -> (tensor<4x5x?xf32>) { // ----- -// Fold DstStyleOp -> tensor.unpack operations. -func.func @fold_dst_style_ops_into_unpack(%arg0 : tensor, %init : tensor) -> tensor { - %cst = arith.constant 0.0 : f32 - %fill = linalg.fill ins(%cst : f32) outs(%init : tensor) -> tensor - %unpack = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [16, 64] into %fill : tensor -> tensor - return %unpack : tensor -} -// CHECK-LABEL: func @fold_dst_style_ops_into_unpack -// CHECK-SAME: %[[ARG0:.+]]: tensor -// CHECK-SAME: %[[INIT:.+]]: tensor -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] -// CHECK-SAME: into %[[INIT]] -// CHECK: return %[[UNPACK]] - -// ----- - // The IR in this test case in invalid. This test tests that the canonicalizer // does not crash. 
@@ -2598,21 +2195,6 @@ func.func @generate_negative_size_verifies() -> tensor { return %tensor : tensor } -// ----- - -func.func @infer_and_fold_pack_unpack_same_tiles(%t: tensor<10x20x4x4xf32>) -> tensor<10x20x4x4xf32> { - %dim1 = arith.constant 40 : index - %dim2 = arith.constant 80 : index - %tensor_empty = tensor.empty(%dim1, %dim2) : tensor - %unpacked = tensor.unpack %t inner_dims_pos = [0, 1] inner_tiles = [4, 4] into %tensor_empty : tensor<10x20x4x4xf32> -> tensor - %cast = tensor.cast %unpacked : tensor to tensor<40x80xf32> - %tensor_empty1 = tensor.empty() : tensor<10x20x4x4xf32> - %packed = tensor.pack %cast inner_dims_pos = [0, 1] inner_tiles = [4, 4] into %tensor_empty1 : tensor<40x80xf32> -> tensor<10x20x4x4xf32> - return %packed : tensor<10x20x4x4xf32> -} -// CHECK-LABEL: func.func @infer_and_fold_pack_unpack_same_tiles -// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] -// CHECK: return %[[SRC]] // ----- @@ -2787,62 +2369,6 @@ func.func @fold_cast_multiple_results(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2x return %0#1 : index } -// ----- - -// CHECK-LABEL: func.func @fold_cast_pack_dynamic_tile_size -// CHECK-SAME: %[[DEST:.*]]: tensor<1x1x8x1xi32>, -// CHECK-SAME: %[[SRC:.*]]: tensor<7x?xi32>, -// CHECK-SAME: %[[PAD:.*]]: i32) -> tensor<1x1x8x1xi32> { -// CHECK: %[[PACK:.*]] = tensor.pack %[[SRC]] padding_value(%[[PAD]] : i32) -// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %[[DEST]] -// CHECK-SAME: test_attr -// CHECK-SAME: : tensor<7x?xi32> -> tensor<1x1x8x1xi32> -// CHECK: return %[[PACK]] : tensor<1x1x8x1xi32> -func.func @fold_cast_pack_dynamic_tile_size( - %dest: tensor<1x1x8x1xi32>, - %src: tensor<7x?xi32>, - %pad: i32) -> tensor<1x1x8x1xi32> { - - %cast = tensor.cast %dest : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32> - %c8 = arith.constant 8 : index - %pack = tensor.pack %src padding_value(%pad : i32) - inner_dims_pos = [0, 1] - inner_tiles = [%c8, 1] - into %cast {test_attr} : tensor<7x?xi32> -> tensor<1x1x?x1xi32> - %res = tensor.cast %pack : tensor<1x1x?x1xi32> to tensor<1x1x8x1xi32> - return %res : tensor<1x1x8x1xi32> -} - -// ----- - -// CHECK-LABEL: func.func @fold_cast_unpack_dynamic_tile_size( -// CHECK-SAME: %[[SRC:.*]]: tensor<1x1x8x1xi32>, -// CHECK-SAME: %[[DEST:.*]]: tensor<7x?xi32>) -> tensor<7x?xi32> { -// CHECK: %[[RES:.*]] = tensor.unpack %[[SRC]] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %[[DEST]] {test_attr} : tensor<1x1x8x1xi32> -> tensor<7x?xi32> -// CHECK: return %[[RES]] : tensor<7x?xi32> -func.func @fold_cast_unpack_dynamic_tile_size( - %src: tensor<1x1x8x1xi32>, - %res: tensor<7x?xi32>) -> tensor<7x?xi32> { - - %cast = tensor.cast %src : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32> - %c8 = arith.constant 8 : index - %unpack = tensor.unpack %cast - inner_dims_pos = [0, 1] - inner_tiles = [%c8, 1] - into %res {test_attr} : tensor<1x1x?x1xi32> -> tensor<7x?xi32> - return %unpack : tensor<7x?xi32> -} - -// ----- - -// CHECK-LABEL: func.func @pack_dont_drop_attributes( -// CHECK: tensor.pack {{.*}} {test_attr} -func.func @pack_dont_drop_attributes(%arg0: tensor, %arg1: tensor<128x?x100x16x1xf16>) -> tensor<128x?x100x16x1xf16> { - %c32_i64 = arith.constant 32 : i64 - %cst = arith.constant 0.000000e+00 : f16 - %pack = tensor.pack %arg0 padding_value(%cst : f16) outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %arg1 {test_attr} : tensor -> tensor<128x?x100x16x1xf16> - return %pack : tensor<128x?x100x16x1xf16> -} // ----- diff --git a/mlir/test/Dialect/Tensor/fold-empty-op.mlir 
b/mlir/test/Dialect/Tensor/fold-empty-op.mlir index 850bbcee34020..7b11c9f43c7ec 100644 --- a/mlir/test/Dialect/Tensor/fold-empty-op.mlir +++ b/mlir/test/Dialect/Tensor/fold-empty-op.mlir @@ -61,77 +61,6 @@ func.func @rank_reducing_empty_tensor_extract(%sz : index, %idx : index) -> tens return %r: tensor<2xf32> } -func.func @pack_empty(%arg0: tensor<8x8x32x32xf32>) -> tensor<8x8x32x32xf32> { - %empty_unpacked = tensor.empty() : tensor<256x256xf32> - %packed = tensor.pack %empty_unpacked - inner_dims_pos = [0, 1] inner_tiles = [32, 32] - into %arg0 : tensor<256x256xf32> -> tensor<8x8x32x32xf32> - return %packed : tensor<8x8x32x32xf32> -} - -// CHECK-LABEL: func.func @pack_empty( -// CHECK-SAME: %[[T:.+]]: tensor<8x8x32x32xf32> -// CHECK-NOT: tensor.pack -// CHECK: return %[[T]] : tensor<8x8x32x32xf32> - -func.func @pack_empty_dynamic(%arg0: tensor, %dim0: index, %dim1: index) -> tensor { - %empty_unpacked = tensor.empty(%dim0, %dim1) : tensor - %packed = tensor.pack %empty_unpacked - inner_dims_pos = [0, 1] inner_tiles = [32, 32] - into %arg0 : tensor -> tensor - return %packed : tensor -} - -// CHECK-LABEL: func.func @pack_empty_dynamic( -// CHECK-SAME: %[[T:.+]]: tensor, -// CHECK-SAME: %[[DIM0:[a-zA-Z0-9_]+]]: index, -// CHECK-SAME: %[[DIM1:[a-zA-Z0-9_]+]]: index -// CHECK-NOT: tensor.pack -// CHECK: return %[[T]] : tensor - -func.func @unpack_empty(%arg0: tensor<256x256xf32>) -> tensor<256x256xf32> { - %empty_packed = tensor.empty() : tensor<8x8x32x32xf32> - %unpacked = tensor.unpack %empty_packed - inner_dims_pos = [0, 1] inner_tiles = [32, 32] - into %arg0 : tensor<8x8x32x32xf32> -> tensor<256x256xf32> - return %unpacked : tensor<256x256xf32> -} - -// CHECK-LABEL: func.func @unpack_empty( -// CHECK-SAME: %[[T:.+]]: tensor<256x256xf32> -// CHECK-NOT: tensor.unpack -// CHECK: return %[[T]] : tensor<256x256xf32> - -func.func @unpack_empty_dynamic(%arg0: tensor, %dim0: index, %dim1: index) -> tensor { - %empty_packed = tensor.empty(%dim0, %dim1) : tensor - %unpacked = tensor.unpack %empty_packed - inner_dims_pos = [0, 1] inner_tiles = [32, 32] - into %arg0 : tensor -> tensor - return %unpacked : tensor -} - -// CHECK-LABEL: func.func @unpack_empty_dynamic( -// CHECK-SAME: %[[T:.+]]: tensor, -// CHECK-SAME: %[[DIM0:[a-zA-Z0-9_]+]]: index, -// CHECK-SAME: %[[DIM1:[a-zA-Z0-9_]+]]: index -// CHECK-NOT: tensor.unpack -// CHECK: return %[[T]] : tensor - -func.func @pack_padded_empty(%arg0: tensor<8x8x32x32xf32>) -> tensor<8x8x32x32xf32> { - %pad = arith.constant 1.0 : f32 - %empty_unpacked = tensor.empty() : tensor<256x256xf32> - %packed = tensor.pack %empty_unpacked - padding_value(%pad : f32) - inner_dims_pos = [0, 1] inner_tiles = [32, 32] - into %arg0 : tensor<256x256xf32> -> tensor<8x8x32x32xf32> - return %packed : tensor<8x8x32x32xf32> -} - -// CHECK-LABEL: func.func @pack_padded_empty( -// CHECK-SAME: %[[T:.+]]: tensor<8x8x32x32xf32> -// CHECK: %[[PACK:.+]] = tensor.pack -// CHECK: return %[[PACK]] : tensor<8x8x32x32xf32> - // ----- module attributes {transform.with_named_sequence} { diff --git a/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir b/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir index bff913f5f55fe..84eb60248b8be 100644 --- a/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir +++ b/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir @@ -1,8 +1,8 @@ -// RUN: mlir-opt -split-input-file -test-tensor-transform-patterns=test-fold-into-pack-and-unpack %s | FileCheck %s +// RUN: mlir-opt -split-input-file 
-test-linalg-transform-patterns=test-fold-into-pack-and-unpack %s | FileCheck %s func.func @fold_unpack_slice(%arg0 : tensor, %arg1 : tensor, %arg2 : index, %arg3 : index) -> tensor { - %0 = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %arg1 + %0 = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %arg1 : tensor -> tensor %1 = tensor.extract_slice %0[0, 0] [%arg2, %arg3] [1, 1] : tensor to tensor return %1 : tensor @@ -13,7 +13,7 @@ func.func @fold_unpack_slice(%arg0 : tensor, %arg1 : tensor -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] inner_dims_pos = [0, 1] inner_tiles = [8, 4] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] inner_dims_pos = [0, 1] inner_tiles = [8, 4] // CHECK-SAME: into %[[INIT]] // CHECK: return %[[UNPACK]] @@ -21,39 +21,39 @@ func.func @fold_unpack_slice(%arg0 : tensor, %arg1 : tensor, %arg1 : tensor, %arg2 : index, %arg3 : index, %arg4 : index) -> tensor { - %0 = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %arg1 + %0 = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %arg1 : tensor -> tensor %1 = tensor.extract_slice %0[0, %arg4] [%arg2, %arg3] [1, 1] : tensor to tensor return %1 : tensor } // CHECK-LABEL: func @nofold_unpack_slice_non_zero_offset( -// CHECK: %[[UNPACK:.+]] = tensor.unpack +// CHECK: %[[UNPACK:.+]] = linalg.unpack // CHECK: tensor.extract_slice %[[UNPACK]] // ----- func.func @nofold_unpack_slice_non_unit_stride(%arg0 : tensor, %arg1 : tensor, %arg2 : index, %arg3 : index, %arg4 : index) -> tensor { - %0 = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %arg1 + %0 = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %arg1 : tensor -> tensor %1 = tensor.extract_slice %0[0, 0] [%arg2, %arg3] [%arg4, 1] : tensor to tensor return %1 : tensor } // CHECK-LABEL: func @nofold_unpack_slice_non_unit_stride( -// CHECK: %[[UNPACK:.+]] = tensor.unpack +// CHECK: %[[UNPACK:.+]] = linalg.unpack // CHECK: tensor.extract_slice %[[UNPACK]] // ----- func.func @nofold_unpack_slice_rank_reduced(%arg0 : tensor, %arg1 : tensor, %arg2 : index, %arg3 : index) -> tensor { - %0 = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %arg1 + %0 = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %arg1 : tensor -> tensor %1 = tensor.extract_slice %0[0, 0] [1, 1] [1, 1] : tensor to tensor return %1 : tensor } // CHECK-LABEL: func @nofold_unpack_slice_rank_reduced( -// CHECK: %[[UNPACK:.+]] = tensor.unpack +// CHECK: %[[UNPACK:.+]] = linalg.unpack // CHECK: tensor.extract_slice %[[UNPACK]] // ----- @@ -66,7 +66,7 @@ func.func @pad_pack(%src: tensor<16641x16xf32>) -> tensor<2082x1x8x32xf32> { tensor.yield %cst : f32 } : tensor<16641x16xf32> to tensor<16656x16xf32> %empty = tensor.empty() : tensor<2082x1x8x32xf32> - %pack = tensor.pack %padded padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %empty + %pack = linalg.pack %padded padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %empty : tensor<16656x16xf32> -> tensor<2082x1x8x32xf32> return %pack : tensor<2082x1x8x32xf32> } @@ -74,7 +74,7 @@ func.func @pad_pack(%src: tensor<16641x16xf32>) -> tensor<2082x1x8x32xf32> { // CHECK-SAME: %[[SRC:[a-zA-Z0-9]+]] // CHECK: %[[PAD_VAL:.+]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[DEST:.+]] = tensor.empty() : tensor<2082x1x8x32xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[SRC]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[SRC]] // CHECK-SAME: 
padding_value(%[[PAD_VAL]] : f32) // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %[[DEST]] @@ -88,13 +88,13 @@ func.func @nofold_pad_pack(%src: tensor<16641x16xf32>) -> tensor<2082x1x8x32xf32 tensor.yield %cst : f32 } : tensor<16641x16xf32> to tensor<16656x16xf32> %empty = tensor.empty() : tensor<2082x1x8x32xf32> - %pack = tensor.pack %padded padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %empty + %pack = linalg.pack %padded padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %empty : tensor<16656x16xf32> -> tensor<2082x1x8x32xf32> return %pack : tensor<2082x1x8x32xf32> } // CHECK-LABEL: func.func @nofold_pad_pack // CHECK: tensor.pad -// CHECK: tensor.pack +// CHECK: linalg.pack // ----- @@ -107,19 +107,19 @@ func.func @pad_pack_different_padding_value(%src: tensor<16641x16xf32>) -> tenso tensor.yield %cst0 : f32 } : tensor<16641x16xf32> to tensor<16656x16xf32> %empty = tensor.empty() : tensor<2082x1x8x32xf32> - %pack = tensor.pack %padded padding_value(%cst1 : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %empty + %pack = linalg.pack %padded padding_value(%cst1 : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %empty : tensor<16656x16xf32> -> tensor<2082x1x8x32xf32> return %pack : tensor<2082x1x8x32xf32> } // CHECK-LABEL: func.func @pad_pack_different_padding_value // CHECK: tensor.pad -// CHECK: tensor.pack +// CHECK: linalg.pack // ----- -func.func @tensor_pack_linalg_transpose_fold(%arg0: tensor<56x57x1x64xf32>) -> tensor<1x57x56x2x32xf32> { +func.func @linalg.pack_linalg_transpose_fold(%arg0: tensor<56x57x1x64xf32>) -> tensor<1x57x56x2x32xf32> { %0 = tensor.empty() : tensor<56x2x1x57x32xf32> - %pack = tensor.pack %arg0 + %pack = linalg.pack %arg0 outer_dims_perm = [0, 3, 2, 1] inner_dims_pos = [3] inner_tiles = [32] @@ -132,10 +132,10 @@ func.func @tensor_pack_linalg_transpose_fold(%arg0: tensor<56x57x1x64xf32>) -> t permutation = [2, 3, 0, 1, 4] return %transposed : tensor<1x57x56x2x32xf32> } -// CHECK: func @tensor_pack_linalg_transpose_fold( +// CHECK: func @linalg.pack_linalg_transpose_fold( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x57x1x64xf32>) // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x57x56x2x32xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [2, 1, 0, 3] // CHECK-SAME: inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[INIT]] @@ -143,9 +143,9 @@ func.func @tensor_pack_linalg_transpose_fold(%arg0: tensor<56x57x1x64xf32>) -> t // ----- -func.func @tensor_pack_linalg_transpose_fold_with_padding(%arg0: tensor<56x57x1x55xf32>, %padding: f32) -> tensor<1x57x56x2x32xf32> { +func.func @linalg.pack_linalg_transpose_fold_with_padding(%arg0: tensor<56x57x1x55xf32>, %padding: f32) -> tensor<1x57x56x2x32xf32> { %0 = tensor.empty() : tensor<56x2x1x57x32xf32> - %pack = tensor.pack %arg0 padding_value(%padding : f32) + %pack = linalg.pack %arg0 padding_value(%padding : f32) outer_dims_perm = [0, 3, 2, 1] inner_dims_pos = [3] inner_tiles = [32] @@ -158,10 +158,10 @@ func.func @tensor_pack_linalg_transpose_fold_with_padding(%arg0: tensor<56x57x1x permutation = [2, 3, 0, 1, 4] return %transposed : tensor<1x57x56x2x32xf32> } -// CHECK: func @tensor_pack_linalg_transpose_fold_with_padding( +// CHECK: func @linalg.pack_linalg_transpose_fold_with_padding( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x57x1x55xf32>, %[[PADDING:.+]]: f32) // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x57x56x2x32xf32> -// CHECK: 
%[[PACK:.+]] = tensor.pack %[[ARG0]] padding_value(%[[PADDING]] : f32) +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] padding_value(%[[PADDING]] : f32) // CHECK-SAME: outer_dims_perm = [2, 1, 0, 3] // CHECK-SAME: inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[INIT]] @@ -169,9 +169,9 @@ func.func @tensor_pack_linalg_transpose_fold_with_padding(%arg0: tensor<56x57x1x // ----- -func.func @tensor_pack_linalg_transpose_fold_no_outer_dims_perm(%arg0: tensor<56x57x1x64xf32>) -> tensor<1x2x56x57x32xf32> { +func.func @linalg.pack_linalg_transpose_fold_no_outer_dims_perm(%arg0: tensor<56x57x1x64xf32>) -> tensor<1x2x56x57x32xf32> { %0 = tensor.empty() : tensor<56x57x1x2x32xf32> - %pack = tensor.pack %arg0 + %pack = linalg.pack %arg0 inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<56x57x1x64xf32> -> tensor<56x57x1x2x32xf32> @@ -183,10 +183,10 @@ func.func @tensor_pack_linalg_transpose_fold_no_outer_dims_perm(%arg0: tensor<56 permutation = [2, 3, 0, 1, 4] return %transposed : tensor<1x2x56x57x32xf32> } -// CHECK: func @tensor_pack_linalg_transpose_fold_no_outer_dims_perm( +// CHECK: func @linalg.pack_linalg_transpose_fold_no_outer_dims_perm( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x57x1x64xf32>) // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x2x56x57x32xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [2, 3, 0, 1] // CHECK-SAME: inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[INIT]] @@ -194,9 +194,9 @@ func.func @tensor_pack_linalg_transpose_fold_no_outer_dims_perm(%arg0: tensor<56 // ----- -func.func @tensor_pack_linalg_transpose_fold_tile_dims_transpose(%arg0: tensor<56x72x24x128xf32>) -> tensor<12x56x4x9x32x8x2xf32> { +func.func @linalg.pack_linalg_transpose_fold_tile_dims_transpose(%arg0: tensor<56x72x24x128xf32>) -> tensor<12x56x4x9x32x8x2xf32> { %0 = tensor.empty() : tensor<4x9x12x56x8x2x32xf32> - %pack = tensor.pack %arg0 + %pack = linalg.pack %arg0 outer_dims_perm = [3, 1, 2, 0] inner_dims_pos = [1, 2, 3] inner_tiles = [8, 2, 32] @@ -209,10 +209,10 @@ func.func @tensor_pack_linalg_transpose_fold_tile_dims_transpose(%arg0: tensor<5 permutation = [2, 3, 0, 1, 6, 4, 5] return %transposed : tensor<12x56x4x9x32x8x2xf32> } -// CHECK: func @tensor_pack_linalg_transpose_fold_tile_dims_transpose( +// CHECK: func @linalg.pack_linalg_transpose_fold_tile_dims_transpose( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x72x24x128xf32>) // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<12x56x4x9x32x8x2xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [2, 0, 3, 1] // CHECK-SAME: inner_dims_pos = [3, 1, 2] inner_tiles = [32, 8, 2] // CHECK-SAME: into %[[INIT]] @@ -220,9 +220,9 @@ func.func @tensor_pack_linalg_transpose_fold_tile_dims_transpose(%arg0: tensor<5 // ----- -func.func @tensor_pack_linalg_transpose_fold_tile_dims_outer_dims_transpose(%arg0: tensor<56x72x24x128xf32>) -> tensor<9x56x2x12x32x8x4xf32> { +func.func @linalg.pack_linalg_transpose_fold_tile_dims_outer_dims_transpose(%arg0: tensor<56x72x24x128xf32>) -> tensor<9x56x2x12x32x8x4xf32> { %0 = tensor.empty() : tensor<4x12x9x56x8x2x32xf32> - %pack = tensor.pack %arg0 + %pack = linalg.pack %arg0 outer_dims_perm = [3, 2, 1, 0] inner_dims_pos = [1, 2, 3] inner_tiles = [8, 2, 32] @@ -235,16 +235,16 @@ func.func @tensor_pack_linalg_transpose_fold_tile_dims_outer_dims_transpose(%arg permutation = [2, 3, 5, 1, 6, 4, 0] return %transposed : 
tensor<9x56x2x12x32x8x4xf32> } -// CHECK: func @tensor_pack_linalg_transpose_fold_tile_dims_outer_dims_transpose( +// CHECK: func @linalg.pack_linalg_transpose_fold_tile_dims_outer_dims_transpose( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x72x24x128xf32>) -// CHECK: tensor.pack +// CHECK: linalg.pack // CHECK: linalg.transpose // ----- -func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims(%arg0: tensor<56x?x?x64xf32>) -> tensor { +func.func @linalg.pack_linalg_transpose_fold_dynamic_outer_dims(%arg0: tensor<56x?x?x64xf32>) -> tensor { %0 = tensor.empty() : tensor<56x2x1x57x32xf32> - %pack = tensor.pack %arg0 + %pack = linalg.pack %arg0 outer_dims_perm = [0, 3, 2, 1] inner_dims_pos = [3] inner_tiles = [32] @@ -259,14 +259,14 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims(%arg0: tensor<56 %return_value = tensor.cast %transposed : tensor<1x57x56x2x32xf32> to tensor return %return_value : tensor } -// CHECK: func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims( +// CHECK: func @linalg.pack_linalg_transpose_fold_dynamic_outer_dims( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x?x?x64xf32>) // CHECK-DAG: %[[c1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[c2:.+]] = arith.constant 2 : index // CHECK: %[[dim:.+]] = tensor.dim %[[ARG0]], %[[c1]] : tensor<56x?x?x64xf32> // CHECK: %[[dim_0:.+]] = tensor.dim %[[ARG0]], %[[c2]] : tensor<56x?x?x64xf32> // CHECK: %[[INIT:.+]] = tensor.empty(%[[dim_0]], %[[dim]]) : tensor -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [2, 1, 0, 3] // CHECK-SAME: inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[INIT]] @@ -274,9 +274,9 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims(%arg0: tensor<56 // ----- -func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_and_tile_dims(%arg0: tensor<56x?x?x128xf32>) -> tensor { +func.func @linalg.pack_linalg_transpose_fold_dynamic_outer_and_tile_dims(%arg0: tensor<56x?x?x128xf32>) -> tensor { %0 = tensor.empty() : tensor<56x9x12x4x8x2x32xf32> - %pack = tensor.pack %arg0 + %pack = linalg.pack %arg0 inner_dims_pos = [1, 2, 3] inner_tiles = [8, 2, 32] into %0 : tensor<56x?x?x128xf32> -> tensor<56x9x12x4x8x2x32xf32> @@ -292,7 +292,7 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_and_tile_dims(%arg0: } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> // CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 ceildiv 2)> -// CHECK-LABEL: func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_and_tile_dims( +// CHECK-LABEL: func.func @linalg.pack_linalg_transpose_fold_dynamic_outer_and_tile_dims( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x?x?x128xf32>) // CHECK-DAG: %[[c1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[c2:.+]] = arith.constant 2 : index @@ -301,15 +301,15 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_and_tile_dims(%arg0: // CHECK: %[[mapped_dim1:.+]] = affine.apply #[[$MAP0]]()[%[[dim]]] // CHECK: %[[mapped_dim2:.+]] = affine.apply #[[$MAP1]]()[%[[dim_0]]] // CHECK: %[[INIT:.+]] = tensor.empty(%[[mapped_dim2]], %[[mapped_dim1]]) : tensor -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] outer_dims_perm = [2, 3, 0, 1] inner_dims_pos = [3, 1, 2] inner_tiles = [32, 8, 2] into %[[INIT]] : tensor<56x?x?x128xf32> -> tensor +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] outer_dims_perm = [2, 3, 0, 1] inner_dims_pos = [3, 1, 2] inner_tiles = [32, 8, 2] into %[[INIT]] : tensor<56x?x?x128xf32> -> tensor // CHECK: %[[CAST:.+]] = tensor.cast 
%[[PACK]] : tensor to tensor // CHECK: return %[[CAST]] : tensor // CHECK: } // ----- -func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims_tile_dims_tile_sizes(%arg0: tensor, %pack_dest: tensor, %transpose_dest: tensor, %tile_p : index, %tile_q : index, %tile_r : index) -> tensor { - %pack = tensor.pack %arg0 +func.func @linalg.pack_linalg_transpose_fold_dynamic_outer_dims_tile_dims_tile_sizes(%arg0: tensor, %pack_dest: tensor, %transpose_dest: tensor, %tile_p : index, %tile_q : index, %tile_r : index) -> tensor { + %pack = linalg.pack %arg0 outer_dims_perm = [3, 0, 2, 1] inner_dims_pos = [1, 2, 3] inner_tiles = [%tile_p, %tile_q, %tile_r] @@ -324,7 +324,7 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims_tile_dims_tile_s } // CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)> // CHECK: module { -// CHECK: func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims_tile_dims_tile_sizes( +// CHECK: func.func @linalg.pack_linalg_transpose_fold_dynamic_outer_dims_tile_dims_tile_sizes( // CHECK-SAME: %[[ARG0:.+]]: tensor, // CHECK-SAME: %[[PACK_DEST:.+]]: tensor, %[[TRANSPOSE_DEST:.+]]: tensor, // CHECK-SAME: %[[ARG1:.+]]: index, %[[ARG2:.+]]: index, @@ -341,13 +341,13 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims_tile_dims_tile_s // CHECK: %[[mapped_dim1:.+]] = affine.apply #[[$MAP]]()[%[[dim_0]], %[[ARG1]]] // CHECK: %[[mapped_dim2:.+]] = affine.apply #[[$MAP]]()[%[[dim_1]], %[[ARG2]]] // CHECK: %[[INIT:.+]] = tensor.empty(%[[mapped_dim2]], %[[mapped_dim1]], %[[mapped_dim0]], %[[dim]], %[[ARG3]], %[[ARG1]], %[[ARG2]]) : tensor -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] outer_dims_perm = [2, 1, 3, 0] inner_dims_pos = [3, 1, 2] inner_tiles = [%[[ARG3]], %[[ARG1]], %[[ARG2]]] into %[[INIT]] : tensor -> tensor +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] outer_dims_perm = [2, 1, 3, 0] inner_dims_pos = [3, 1, 2] inner_tiles = [%[[ARG3]], %[[ARG1]], %[[ARG2]]] into %[[INIT]] : tensor -> tensor // CHECK: return %[[PACK]] : tensor // CHECK: } // ----- -func.func @linalg_transpose_tensor_pack_fold(%arg0: tensor<56x57x1x64xf32>) -> tensor<1x57x56x2x32xf32> { +func.func @linalg_transpose_linalg.pack_fold(%arg0: tensor<56x57x1x64xf32>) -> tensor<1x57x56x2x32xf32> { %0 = tensor.empty() : tensor<1x56x57x64xf32> %transposed = linalg.transpose ins(%arg0 : tensor<56x57x1x64xf32>) @@ -355,17 +355,17 @@ func.func @linalg_transpose_tensor_pack_fold(%arg0: tensor<56x57x1x64xf32>) -> t permutation = [2, 0, 1, 3] %1 = tensor.empty() : tensor<1x57x56x2x32xf32> - %pack = tensor.pack %transposed + %pack = linalg.pack %transposed outer_dims_perm = [0, 2, 1, 3] inner_dims_pos = [3] inner_tiles = [32] into %1 : tensor<1x56x57x64xf32> -> tensor<1x57x56x2x32xf32> return %pack : tensor<1x57x56x2x32xf32> } -//CHECK-LABEL: func @linalg_transpose_tensor_pack_fold( +//CHECK-LABEL: func @linalg_transpose_linalg.pack_fold( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x57x1x64xf32>) // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x57x56x2x32xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [2, 1, 0, 3] // CHECK-SAME: inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[INIT]] @@ -373,7 +373,7 @@ func.func @linalg_transpose_tensor_pack_fold(%arg0: tensor<56x57x1x64xf32>) -> t // ----- -func.func @linalg_transpose_tensor_pack_fold_with_padding(%arg0: tensor<56x57x1x55xf32>, %padding: f32) -> tensor<1x57x56x2x32xf32> { +func.func 
@linalg_transpose_linalg.pack_fold_with_padding(%arg0: tensor<56x57x1x55xf32>, %padding: f32) -> tensor<1x57x56x2x32xf32> { %0 = tensor.empty() : tensor<1x56x57x55xf32> %transpose = linalg.transpose ins(%arg0 : tensor<56x57x1x55xf32>) @@ -381,17 +381,17 @@ func.func @linalg_transpose_tensor_pack_fold_with_padding(%arg0: tensor<56x57x1x permutation = [2, 0, 1, 3] %1 = tensor.empty() : tensor<1x57x56x2x32xf32> - %pack = tensor.pack %transpose padding_value(%padding : f32) + %pack = linalg.pack %transpose padding_value(%padding : f32) outer_dims_perm = [0, 2, 1, 3] inner_dims_pos = [3] inner_tiles = [32] into %1 : tensor<1x56x57x55xf32> -> tensor<1x57x56x2x32xf32> return %pack : tensor<1x57x56x2x32xf32> } -//CHECK-LABEL: func @linalg_transpose_tensor_pack_fold_with_padding( +//CHECK-LABEL: func @linalg_transpose_linalg.pack_fold_with_padding( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x57x1x55xf32>, %[[PADDING:.+]]: f32) // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x57x56x2x32xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] padding_value(%[[PADDING]] : f32) +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] padding_value(%[[PADDING]] : f32) // CHECK-SAME: outer_dims_perm = [2, 1, 0, 3] // CHECK-SAME: inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[INIT]] @@ -399,7 +399,7 @@ func.func @linalg_transpose_tensor_pack_fold_with_padding(%arg0: tensor<56x57x1x // ----- -func.func @linalg_transpose_tensor_pack_fold_no_outer_dims_perm(%arg0: tensor<56x57x1x64xf32>) -> tensor<1x56x57x2x32xf32> { +func.func @linalg_transpose_linalg.pack_fold_no_outer_dims_perm(%arg0: tensor<56x57x1x64xf32>) -> tensor<1x56x57x2x32xf32> { %0 = tensor.empty() : tensor<1x56x57x64xf32> %transposed = linalg.transpose ins(%arg0 : tensor<56x57x1x64xf32>) @@ -407,16 +407,16 @@ func.func @linalg_transpose_tensor_pack_fold_no_outer_dims_perm(%arg0: tensor<56 permutation = [2, 0, 1, 3] %1 = tensor.empty() : tensor<1x56x57x2x32xf32> - %pack = tensor.pack %transposed + %pack = linalg.pack %transposed inner_dims_pos = [3] inner_tiles = [32] into %1 : tensor<1x56x57x64xf32> -> tensor<1x56x57x2x32xf32> return %pack : tensor<1x56x57x2x32xf32> } -//CHECK-LABEL: func @linalg_transpose_tensor_pack_fold_no_outer_dims_perm( +//CHECK-LABEL: func @linalg_transpose_linalg.pack_fold_no_outer_dims_perm( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x57x1x64xf32>) // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x56x57x2x32xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [2, 0, 1, 3] // CHECK-SAME: inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[INIT]] @@ -424,25 +424,25 @@ func.func @linalg_transpose_tensor_pack_fold_no_outer_dims_perm(%arg0: tensor<56 // ----- -func.func @linalg_transpose_tensor_pack_fold_complex_inner_dims_change(%arg0: tensor<25x30x35x40xf32>, %transpose_dest: tensor<35x40x25x30xf32>, %pack_dest: tensor<3x35x5x8x5x10x5xf32>) -> tensor<3x35x5x8x5x10x5xf32> { +func.func @linalg_transpose_linalg.pack_fold_complex_inner_dims_change(%arg0: tensor<25x30x35x40xf32>, %transpose_dest: tensor<35x40x25x30xf32>, %pack_dest: tensor<3x35x5x8x5x10x5xf32>) -> tensor<3x35x5x8x5x10x5xf32> { %transposed = linalg.transpose ins(%arg0 : tensor<25x30x35x40xf32>) outs(%transpose_dest : tensor<35x40x25x30xf32>) permutation = [2, 3, 0, 1] - %pack = tensor.pack %transposed + %pack = linalg.pack %transposed outer_dims_perm = [3, 0, 2, 1] inner_dims_pos = [1, 3, 2] inner_tiles = [5, 10, 5] into %pack_dest : tensor<35x40x25x30xf32> -> 
tensor<3x35x5x8x5x10x5xf32> return %pack : tensor<3x35x5x8x5x10x5xf32> } -//CHECK-LABEL: func.func @linalg_transpose_tensor_pack_fold_complex_inner_dims_change( +//CHECK-LABEL: func.func @linalg_transpose_linalg.pack_fold_complex_inner_dims_change( // CHECK-SAME: %[[ARG0:.+]]: tensor<25x30x35x40xf32>, // CHECK-SAME: %[[ARG1:.+]]: tensor<35x40x25x30xf32>, // CHECK-SAME: %[[ARG2:.+]]: tensor<3x35x5x8x5x10x5xf32>) -> tensor<3x35x5x8x5x10x5xf32> { // CHECK: %[[VAL0:.+]] = tensor.empty() : tensor<3x35x5x8x5x10x5xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [1, 2, 0, 3] // CHECK-SAME: inner_dims_pos = [3, 1, 0] // CHECK-SAME: inner_tiles = [5, 10, 5] @@ -451,13 +451,13 @@ func.func @linalg_transpose_tensor_pack_fold_complex_inner_dims_change(%arg0: te // ----- -func.func @linalg_transpose_tensor_pack_fold_dynamic_outer_dims_tile_dims_tile_sizes(%arg0: tensor, %transpose_dest: tensor, %pack_dest: tensor, %tile_p : index, %tile_q : index, %tile_r : index) -> tensor { +func.func @linalg_transpose_linalg.pack_fold_dynamic_outer_dims_tile_dims_tile_sizes(%arg0: tensor, %transpose_dest: tensor, %pack_dest: tensor, %tile_p : index, %tile_q : index, %tile_r : index) -> tensor { %transposed = linalg.transpose ins(%arg0 : tensor) outs(%transpose_dest : tensor) permutation = [2, 3, 0, 1] - %pack = tensor.pack %transposed + %pack = linalg.pack %transposed outer_dims_perm = [3, 0, 2, 1] inner_dims_pos = [1, 3, 2] inner_tiles = [%tile_p, %tile_q, %tile_r] @@ -465,7 +465,7 @@ func.func @linalg_transpose_tensor_pack_fold_dynamic_outer_dims_tile_dims_tile_s return %pack : tensor } // CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)> -//CHECK-LABEL: func.func @linalg_transpose_tensor_pack_fold_dynamic_outer_dims_tile_dims_tile_sizes( +//CHECK-LABEL: func.func @linalg_transpose_linalg.pack_fold_dynamic_outer_dims_tile_dims_tile_sizes( // CHECK-SAME: %[[ARG0:.+]]: tensor, %[[ARG1:.+]]: tensor, // CHECK-SAME: %[[ARG2:.+]]: tensor, %[[ARG3:.+]]: index, %[[ARG4:.+]]: index, %[[ARG5:.+]]: index) -> tensor { // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index @@ -480,12 +480,12 @@ func.func @linalg_transpose_tensor_pack_fold_dynamic_outer_dims_tile_dims_tile_s // CHECK: %[[VAL1:.+]] = affine.apply #[[$MAP]]()[%[[DIM0]], %[[ARG4]]] // CHECK: %[[VAL2:.+]] = affine.apply #[[$MAP]]()[%[[DIM]], %[[ARG5]]] // CHECK: %[[VAL3:.+]] = tensor.empty(%[[VAL1]], %[[DIM1]], %[[VAL2]], %[[VAL0]], %[[ARG3]], %[[ARG4]], %[[ARG5]]) : tensor -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] outer_dims_perm = [1, 2, 0, 3] inner_dims_pos = [3, 1, 0] inner_tiles = [%[[ARG3]], %[[ARG4]], %[[ARG5]]] into %[[VAL3]] : tensor -> tensor +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] outer_dims_perm = [1, 2, 0, 3] inner_dims_pos = [3, 1, 0] inner_tiles = [%[[ARG3]], %[[ARG4]], %[[ARG5]]] into %[[VAL3]] : tensor -> tensor // CHECK: return %[[PACK]] : tensor // ----- -func.func @linalg_transpose_tensor_pack_multiple_tiles(%arg0: tensor) -> tensor<32x?x64x16x2xbf16> { +func.func @linalg_transpose_linalg.pack_multiple_tiles(%arg0: tensor) -> tensor<32x?x64x16x2xbf16> { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : bf16 %dim = tensor.dim %arg0, %c0 : tensor @@ -497,7 +497,7 @@ func.func @linalg_transpose_tensor_pack_multiple_tiles(%arg0: tensor - %pack = tensor.pack %transposed + %pack = linalg.pack %transposed padding_value(%cst : bf16) outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] @@ -506,14 +506,14 @@ func.func 
@linalg_transpose_tensor_pack_multiple_tiles(%arg0: tensor } // CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> -//CHECK-LABEL: func.func @linalg_transpose_tensor_pack_multiple_tiles( +//CHECK-LABEL: func.func @linalg_transpose_linalg.pack_multiple_tiles( // CHECK-SAME: %[[ARG0:.+]]: tensor) -> tensor<32x?x64x16x2xbf16> { // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[CST:.+]] = arith.constant 0.000000e+00 : bf16 // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[VAL0:.+]] = affine.apply #[[$MAP]]()[%[[DIM]]] // CHECK: %[[VAL1:.+]] = tensor.empty(%[[VAL0]]) : tensor<32x?x64x16x2xbf16> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: padding_value(%[[CST]] : bf16) // CHECK-SAME: outer_dims_perm = [1, 0, 2] // CHECK-SAME: inner_dims_pos = [0, 2] @@ -524,23 +524,23 @@ func.func @linalg_transpose_tensor_pack_multiple_tiles(%arg0: tensor) -> tensor<16x4xi32> { +func.func @linalg_transpose_linalg.unpack_fold(%arg0: tensor<1x1x4x16xi32>) -> tensor<16x4xi32> { %0 = tensor.empty() : tensor<1x1x16x4xi32> %transposed = linalg.transpose ins(%arg0 : tensor<1x1x4x16xi32>) outs(%0 : tensor<1x1x16x4xi32>) permutation = [1, 0, 3, 2] %1 = tensor.empty() : tensor<16x4xi32> - %unpack = tensor.unpack %transposed + %unpack = linalg.unpack %transposed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 4] into %1 : tensor<1x1x16x4xi32> -> tensor<16x4xi32> return %unpack : tensor<16x4xi32> } -//CHECK-LABEL: func.func @linalg_transpose_tensor_unpack_fold( +//CHECK-LABEL: func.func @linalg_transpose_linalg.unpack_fold( // CHECK-SAME: %[[ARG0:.+]]: tensor<1x1x4x16xi32>) -> tensor<16x4xi32> { // CHECK: %[[OUT:.+]] = tensor.empty() : tensor<16x4xi32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [1, 0] // CHECK-SAME: inner_dims_pos = [1, 0] // CHECK-SAME: inner_tiles = [4, 16] @@ -550,23 +550,23 @@ func.func @linalg_transpose_tensor_unpack_fold(%arg0: tensor<1x1x4x16xi32>) -> t // ----- -func.func @linalg_transpose_tensor_unpack_fold_partial_tile(%arg0: tensor<1x1x4x16xi32>) -> tensor<15x3xi32> { +func.func @linalg_transpose_linalg.unpack_fold_partial_tile(%arg0: tensor<1x1x4x16xi32>) -> tensor<15x3xi32> { %0 = tensor.empty() : tensor<1x1x16x4xi32> %transposed = linalg.transpose ins(%arg0 : tensor<1x1x4x16xi32>) outs(%0 : tensor<1x1x16x4xi32>) permutation = [1, 0, 3, 2] %1 = tensor.empty() : tensor<15x3xi32> - %unpack = tensor.unpack %transposed + %unpack = linalg.unpack %transposed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 4] into %1 : tensor<1x1x16x4xi32> -> tensor<15x3xi32> return %unpack : tensor<15x3xi32> } -//CHECK-LABEL: func.func @linalg_transpose_tensor_unpack_fold_partial_tile( +//CHECK-LABEL: func.func @linalg_transpose_linalg.unpack_fold_partial_tile( // CHECK-SAME: %[[ARG0:.+]]: tensor<1x1x4x16xi32>) -> tensor<15x3xi32> { // CHECK: %[[OUT:.+]] = tensor.empty() : tensor<15x3xi32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [1, 0] // CHECK-SAME: inner_dims_pos = [1, 0] // CHECK-SAME: inner_tiles = [4, 16] @@ -576,20 +576,20 @@ func.func @linalg_transpose_tensor_unpack_fold_partial_tile(%arg0: tensor<1x1x4x // ----- -func.func @linalg_transpose_tensor_unpack_fold_dynamic_outer_dims_tile_dims_tile_sizes(%arg0: tensor, %transpose_dest: tensor, %unpack_dest: tensor, 
%tile_p : index, %tile_q : index) -> tensor { +func.func @linalg_transpose_linalg.unpack_fold_dynamic_outer_dims_tile_dims_tile_sizes(%arg0: tensor, %transpose_dest: tensor, %unpack_dest: tensor, %tile_p : index, %tile_q : index) -> tensor { %transposed = linalg.transpose ins(%arg0 : tensor) outs(%transpose_dest : tensor) permutation = [1, 0, 3, 2] - %unpack = tensor.unpack %transposed + %unpack = linalg.unpack %transposed outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [%tile_p, %tile_q] into %unpack_dest : tensor -> tensor return %unpack : tensor } -// CHECK-LABEL: func.func @linalg_transpose_tensor_unpack_fold_dynamic_outer_dims_tile_dims_tile_sizes( +// CHECK-LABEL: func.func @linalg_transpose_linalg.unpack_fold_dynamic_outer_dims_tile_dims_tile_sizes( // CHECK-SAME: %[[ARG0:.+]]: tensor, %[[ARG1:.+]]: tensor, %[[ARG2:.+]]: tensor, // CHECK-SAME: %[[IDX1:.+]]: index, %[[IDX2:.+]]: index) -> tensor { // CHECK-DAG: %[[CST1:.+]] = arith.constant 1 : index @@ -597,7 +597,7 @@ func.func @linalg_transpose_tensor_unpack_fold_dynamic_outer_dims_tile_dims_tile // CHECK-DAG: %[[DIM0:.+]] = tensor.dim %[[ARG2]], %[[CST0]] : tensor // CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[ARG2]], %[[CST1]] : tensor // CHECK: %[[OUT:.+]] = tensor.empty(%[[DIM0]], %[[DIM1]]) : tensor -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 1] // CHECK-SAME: inner_dims_pos = [1, 0] // CHECK-SAME: inner_tiles = [%[[IDX2]], %[[IDX1]]] @@ -607,9 +607,9 @@ func.func @linalg_transpose_tensor_unpack_fold_dynamic_outer_dims_tile_dims_tile // ----- -func.func @tensor_unpack_linalg_transpose_fold(%arg0: tensor<56x57x1x64xf32>) -> tensor<3648x56xf32> { +func.func @linalg.unpack_linalg_transpose_fold(%arg0: tensor<56x57x1x64xf32>) -> tensor<3648x56xf32> { %0 = tensor.empty() : tensor<56x3648xf32> - %pack = tensor.unpack %arg0 + %pack = linalg.unpack %arg0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [1, 64] @@ -622,10 +622,10 @@ func.func @tensor_unpack_linalg_transpose_fold(%arg0: tensor<56x57x1x64xf32>) -> permutation = [1,0] return %transposed : tensor<3648x56xf32> } -// CHECK-LABEL: func.func @tensor_unpack_linalg_transpose_fold( +// CHECK-LABEL: func.func @linalg.unpack_linalg_transpose_fold( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x57x1x64xf32>) -> tensor<3648x56xf32> { // CHECK: %[[OUT:.+]] = tensor.empty() : tensor<3648x56xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [1, 0] // CHECK-SAME: inner_dims_pos = [1, 0] // CHECK-SAME: inner_tiles = [1, 64] @@ -637,7 +637,7 @@ func.func @tensor_unpack_linalg_transpose_fold(%arg0: tensor<56x57x1x64xf32>) -> func.func @tensor_padded_unpack_linalg_transpose_fold(%arg0: tensor<71x7x4x16x16xf32>) -> tensor<100x71x64xf32> { %0 = tensor.empty() : tensor<71x100x64xf32> - %pack = tensor.unpack %arg0 + %pack = linalg.unpack %arg0 inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %0 : tensor<71x7x4x16x16xf32> -> tensor<71x100x64xf32> @@ -652,7 +652,7 @@ func.func @tensor_padded_unpack_linalg_transpose_fold(%arg0: tensor<71x7x4x16x16 // CHECK-LABEL: func.func @tensor_padded_unpack_linalg_transpose_fold( // CHECK-SAME: %[[ARG0:.+]]: tensor<71x7x4x16x16xf32>) -> tensor<100x71x64xf32> { // CHECK: %[[OUT:.+]] = tensor.empty() : tensor<100x71x64xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm 
= [1, 0, 2] // CHECK-SAME: inner_dims_pos = [0, 2] // CHECK-SAME: inner_tiles = [16, 16] @@ -668,7 +668,7 @@ func.func @non_involution_transpose_unpack_fold(%arg0: tensor<2x3x5x4x16xi32>) - outs(%0 : tensor<5x2x3x16x4xi32>) permutation = [2, 0, 1, 4, 3] %1 = tensor.empty() : tensor<5x48x8xi32> - %unpack = tensor.unpack %transposed + %unpack = linalg.unpack %transposed outer_dims_perm = [0, 2, 1] inner_dims_pos = [1, 2] inner_tiles = [16, 4] into @@ -678,7 +678,7 @@ func.func @non_involution_transpose_unpack_fold(%arg0: tensor<2x3x5x4x16xi32>) - //CHECK-LABEL: func.func @non_involution_transpose_unpack_fold( // CHECK-SAME: %[[ARG0:.+]]: tensor<2x3x5x4x16xi32>) -> tensor<5x48x8xi32> { // CHECK: %[[OUT:.+]] = tensor.empty() : tensor<5x48x8xi32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [2, 1, 0] // CHECK-SAME: inner_dims_pos = [2, 1] // CHECK-SAME: inner_tiles = [4, 16] @@ -690,7 +690,7 @@ func.func @non_involution_transpose_unpack_fold(%arg0: tensor<2x3x5x4x16xi32>) - func.func @unpack_non_involution_transpose_fold(%arg0: tensor<57x3x56x1x64xf32>) -> tensor<3648x3x56xf32> { %0 = tensor.empty() : tensor<3x56x3648xf32> - %unpack = tensor.unpack %arg0 + %unpack = linalg.unpack %arg0 outer_dims_perm = [2, 0, 1] inner_dims_pos = [1, 2] inner_tiles = [1, 64] @@ -706,7 +706,7 @@ func.func @unpack_non_involution_transpose_fold(%arg0: tensor<57x3x56x1x64xf32>) // CHECK-LABEL: func.func @unpack_non_involution_transpose_fold( // CHECK-SAME: %[[ARG0:.+]]: tensor<57x3x56x1x64xf32>) -> tensor<3648x3x56xf32> { // CHECK: %[[OUT:.+]] = tensor.empty() : tensor<3648x3x56xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 1, 2] // CHECK-SAME: inner_dims_pos = [2, 0] // CHECK-SAME: inner_tiles = [1, 64] @@ -722,7 +722,7 @@ func.func @transpose_unpacked_dims_no_fold(%arg0: tensor<2x16x5x4x3xi32>) -> ten outs(%0 : tensor<5x2x3x16x4xi32>) permutation = [2, 0, 4, 1, 3] %1 = tensor.empty() : tensor<5x32x12xi32> - %unpack = tensor.unpack %transposed + %unpack = linalg.unpack %transposed inner_dims_pos = [1, 2] inner_tiles = [16, 4] into %1 : tensor<5x2x3x16x4xi32> -> tensor<5x32x12xi32> @@ -730,7 +730,7 @@ func.func @transpose_unpacked_dims_no_fold(%arg0: tensor<2x16x5x4x3xi32>) -> ten } //CHECK-LABEL: func.func @transpose_unpacked_dims_no_fold( // CHECK: linalg.transpose -// CHECK: tensor.unpack +// CHECK: linalg.unpack // ----- @@ -747,7 +747,7 @@ func.func @generic_transpose_unpack_fold(%arg0: tensor<2x3x5x4x16xi32>) -> tenso linalg.yield %in : i32 } -> tensor<5x2x3x16x4xi32> %1 = tensor.empty() : tensor<5x48x8xi32> - %unpack = tensor.unpack %transposed + %unpack = linalg.unpack %transposed outer_dims_perm = [0, 2, 1] inner_dims_pos = [1, 2] inner_tiles = [16, 4] into @@ -757,7 +757,7 @@ func.func @generic_transpose_unpack_fold(%arg0: tensor<2x3x5x4x16xi32>) -> tenso //CHECK-LABEL: func.func @generic_transpose_unpack_fold( // CHECK-SAME: %[[ARG0:.+]]: tensor<2x3x5x4x16xi32>) -> tensor<5x48x8xi32> { // CHECK: %[[OUT:.+]] = tensor.empty() : tensor<5x48x8xi32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [2, 1, 0] // CHECK-SAME: inner_dims_pos = [2, 1] // CHECK-SAME: inner_tiles = [4, 16] @@ -771,7 +771,7 @@ func.func @generic_transpose_unpack_fold(%arg0: tensor<2x3x5x4x16xi32>) -> tenso #map1 = affine_map<(d0, d1, d2)->(d0, d1, d2)> 
func.func @unpack_generic_transpose_fold(%arg0: tensor<57x3x56x1x64xf32>) -> tensor<3648x3x56xf32> { %0 = tensor.empty() : tensor<3x56x3648xf32> - %unpack = tensor.unpack %arg0 + %unpack = linalg.unpack %arg0 outer_dims_perm = [2, 0, 1] inner_dims_pos = [1, 2] inner_tiles = [1, 64] @@ -791,7 +791,7 @@ func.func @unpack_generic_transpose_fold(%arg0: tensor<57x3x56x1x64xf32>) -> ten // CHECK-LABEL: func.func @unpack_generic_transpose_fold( // CHECK-SAME: %[[ARG0:.+]]: tensor<57x3x56x1x64xf32>) -> tensor<3648x3x56xf32> { // CHECK: %[[OUT:.+]] = tensor.empty() : tensor<3648x3x56xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 1, 2] // CHECK-SAME: inner_dims_pos = [2, 0] // CHECK-SAME: inner_tiles = [1, 64] diff --git a/mlir/test/Dialect/Tensor/invalid.mlir b/mlir/test/Dialect/Tensor/invalid.mlir index 0c6d8f4e05c33..654169841c1c1 100644 --- a/mlir/test/Dialect/Tensor/invalid.mlir +++ b/mlir/test/Dialect/Tensor/invalid.mlir @@ -635,181 +635,6 @@ func.func @empty_wrong_number_of_operands(%sz : index) { // ----- -func.func @pack_invalid_no_padding_no_full_tiles(%input: tensor<256x128xf32>, %output: tensor<8x8x16x33xf32>) -> tensor<8x8x16x33xf32> { - // expected-error@+1 {{invalid tile factor or output size provided. Only full tiles are supported when padding_value is not set}} - %0 = tensor.pack %input inner_dims_pos = [1, 0] inner_tiles = [16, 33] into %output : tensor<256x128xf32> -> tensor<8x8x16x33xf32> - return %0 : tensor<8x8x16x33xf32> -} - -// ----- - -func.func @pack_invalid_no_padding_no_full_tiles_dyn_tiles(%input: tensor<256x128xf32>, %output: tensor<10x8x?x?xf32>, %tile_size_0: index, %tile_size_1: index) -> tensor<10x8x?x?xf32> { - // expected-error@+1 {{invalid tile factor or output size provided. Only full tiles are supported when padding_value is not set}} - %0 = tensor.pack %input inner_dims_pos = [1, 0] inner_tiles = [%tile_size_0, %tile_size_1] into %output : tensor<256x128xf32> -> tensor<10x8x?x?xf32> - return %0 : tensor<10x8x?x?xf32> -} - -// ----- - -func.func @pack_invalid_no_padding_no_full_tiles_dyn_tiles_outperm(%input: tensor<256x128xf32>, %output: tensor<8x10x?x?xf32>, %tile_size_0: index, %tile_size_1: index) -> tensor<8x10x?x?xf32> { - // expected-error@+1 {{invalid tile factor or output size provided. 
Only full tiles are supported when padding_value is not set}} - %0 = tensor.pack %input outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [%tile_size_0, %tile_size_1] into %output : tensor<256x128xf32> -> tensor<8x10x?x?xf32> - return %0 : tensor<8x10x?x?xf32> -} - -// ----- - -func.func @pad_and_pack_invalid_type(%input: tensor<13x15xf32>, %output: tensor<2x8x8x2xf32>, %pad: i32) -> tensor<2x8x8x2xf32> { - // expected-error@+1 {{expected padding_value has 'f32' but got: 'i32'}} - %0 = tensor.pack %input padding_value(%pad: i32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor<13x15xf32> -> tensor<2x8x8x2xf32> - return %0 : tensor<2x8x8x2xf32> -} - -// ----- - -func.func @pack_invalid_inner_dims_pos_vector(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { - // expected-error@+1 {{invalid inner_dims_pos vector}} - %0 = tensor.pack %input inner_dims_pos = [2, 0] inner_tiles = [2, 2] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32> - return %0 : tensor<8x8x32x16xf32> -} - -// ----- - -func.func @pack_invalid_duplicate_element_in_inner_dims(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { - // expected-error@+1 {{invalid inner_dims_pos vector}} - %0 = tensor.pack %input inner_dims_pos = [1, 1] inner_tiles = [2, 2] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32> - return %0 : tensor<8x8x32x16xf32> -} - -// ----- - -func.func @pack_invalid_duplicate_element_in_outer_perm(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { - // expected-error@+1 {{invalid outer_dims_perm vector}} - %0 = tensor.pack %input outer_dims_perm = [1, 1] inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32> - return %0 : tensor<8x8x32x16xf32> -} - -// ----- - -func.func @pack_invalid_output_rank(%input: tensor<256x128xf32>, %output: tensor<64x32x16xf32>) -> tensor<64x32x16xf32> { - // expected-error@+1 {{packed rank != (unpacked rank + num tiling factors), got 3 != 4}} - %0 = tensor.pack %input inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %output : tensor<256x128xf32> -> tensor<64x32x16xf32> - return %0 : tensor<64x32x16xf32> -} - -// ----- - -func.func @unpack_invalid_output_rank(%input: tensor<256x128xf32>, %output: tensor<64x32x16xf32>) -> tensor<256x128xf32> { - // expected-error@+1 {{packed rank != (unpacked rank + num tiling factors), got 3 != 4}} - %0 = tensor.unpack %output inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %input : tensor<64x32x16xf32> -> tensor<256x128xf32> - return %0 : tensor<256x128xf32> -} - -// ----- - -func.func @unpack_invalid_out_of_bound_outer_perm(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { - // expected-error@+1 {{invalid outer_dims_perm vector}} - %0 = tensor.unpack %output outer_dims_perm = [2, 1] inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %input : tensor<8x8x32x16xf32> -> tensor<256x128xf32> - return %0 : tensor<256x128xf32> -} - -// ----- - -func.func @pack_invalid_outer_dims_perm(%source: tensor<128x256xf32>, %dest: tensor<16x4x32x16xf32>) -> tensor<16x4x32x16xf32> { - // expected-error@+1 {{outer_dims_perm must be a permutation or empty}} - %0 = tensor.pack %source outer_dims_perm = [0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<16x4x32x16xf32> - return %0 : tensor<16x4x32x16xf32> -} - -// ----- - -func.func @unpack_invalid_outer_dims_perm(%source: 
tensor<128x256xf32>, %dest: tensor<16x4x32x16xf32>) -> tensor<128x256xf32> { - // expected-error@+1 {{outer_dims_perm must be a permutation or empty}} - %0 = tensor.unpack %dest outer_dims_perm = [1] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %source : tensor<16x4x32x16xf32> -> tensor<128x256xf32> - return %0 : tensor<128x256xf32> -} - -// ----- - -func.func @pack_invalid(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { - // expected-error@+1 {{the shape of output is not large enough to hold the packed data. Expected at least 'tensor<8x8x16x32xf32>', got 'tensor<8x8x32x16xf32>'}} - %0 = tensor.pack %input inner_dims_pos = [1, 0] inner_tiles = [16, 32] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32> - return %0 : tensor<8x8x32x16xf32> -} - -// ----- - -func.func @unpack_invalid(%output: tensor<256x128xf32>, %input: tensor<8x8x32x16xf32>) -> tensor<256x128xf32> { - // expected-error@+1 {{the shape of output is not large enough to hold the packed data. Expected at least 'tensor<8x32x4x32xf32>', got 'tensor<8x8x32x16xf32>'}} - %0 = tensor.unpack %input inner_dims_pos = [1, 0] inner_tiles = [4, 32] into %output : tensor<8x8x32x16xf32> -> tensor<256x128xf32> - return %0 : tensor<256x128xf32> -} - -// ----- - -func.func @pack_invalid(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { - // expected-error@+1 {{invalid zero tile factor}} - %0 = tensor.pack %input inner_dims_pos = [1, 0] inner_tiles = [0, 2] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32> - return %0 : tensor<8x8x32x16xf32> -} - -// ----- -func.func @pack_mismatch_inner_tile_size_and_output_shape( - %input : tensor, %output : tensor) -> tensor { - // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} - %0 = tensor.pack %input inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %output : tensor -> tensor - return %0 : tensor -} - -// ----- - -func.func @pack_dynamic_inner_tile_size_and_static_output_shape( - %input : tensor, %output : tensor) -> tensor { - %c8 = arith.constant 8 : index - // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} - %0 = tensor.pack %input inner_dims_pos = [0, 1] inner_tiles = [8, %c8] into %output : tensor -> tensor - return %0 : tensor -} - -// ----- - -func.func @pack_static_inner_tile_size_and_dynamic_output_shape( - %input : tensor, %output : tensor) -> tensor { - // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} - %0 = tensor.pack %input inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %output : tensor -> tensor - return %0 : tensor -} - -// ----- - -func.func @unpack_mismatch_inner_tile_size_and_output_shape( - %input : tensor, %output : tensor) -> tensor { - // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} - %0 = tensor.unpack %input inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %output : tensor -> tensor - return %0 : tensor -} - -// ----- - -func.func @unpack_dynamic_inner_tile_size_and_static_output_shape( - %input : tensor, %output : tensor) -> tensor { - %c8 = arith.constant 8 : index - // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} - %0 = tensor.unpack %input inner_dims_pos = [0, 1] inner_tiles = [%c8, 4] into %output : tensor -> tensor - return %0 : tensor -} - -// ----- - 
-func.func @unpack_static_inner_tile_size_and_dynamic_output_shape( - %input : tensor, %output : tensor) -> tensor { - // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} - %0 = tensor.unpack %input inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %output : tensor -> tensor - return %0 : tensor -} - -// ----- - func.func @bitcast_index_0(%arg0 : tensor) -> tensor { // expected-error @+1 {{'tensor.bitcast' op result #0 must be tensor of signless integer or unsigned integer or signed integer or floating-point values, but got 'tensor'}} %0 = tensor.bitcast %arg0 : tensor to tensor diff --git a/mlir/test/Dialect/Tensor/ops.mlir b/mlir/test/Dialect/Tensor/ops.mlir index 378137a14b59f..930986211cb6d 100644 --- a/mlir/test/Dialect/Tensor/ops.mlir +++ b/mlir/test/Dialect/Tensor/ops.mlir @@ -358,106 +358,3 @@ func.func @gather_scatter( (tensor<1x3x4xf32>, tensor<4x5x6xf32>, tensor<1x3x2xi32>) -> tensor<4x5x6xf32> return } - -// ----- - -func.func @pack_nc_to_ncnc(%source: tensor<128x256xf32>, %dest: tensor<4x16x32x16xf32>) -> tensor<128x256xf32> { - %0 = tensor.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> - %1 = tensor.empty() : tensor<128x256xf32> - %2 = tensor.unpack %0 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %1 : tensor<4x16x32x16xf32> -> tensor<128x256xf32> - return %2 : tensor<128x256xf32> -} - -// CHECK-LABEL: func.func @pack_nc_to_ncnc( -// CHECK-SAME: %[[SOURCE:.*]]: tensor<128x256xf32>, -// CHECK-SAME: %[[DEST:.*]]: tensor<4x16x32x16xf32>) -// CHECK: %[[PACKED:.*]] = tensor.pack %[[SOURCE]] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %[[DEST]] : tensor<128x256xf32> -> tensor<4x16x32x16xf32> -// CHECK: %[[BUFF:.*]] = tensor.empty() : tensor<128x256xf32> -// CHECK: %{{.*}} = tensor.unpack %[[PACKED]] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %[[BUFF]] : tensor<4x16x32x16xf32> -> tensor<128x256xf32> - -// ----- - -func.func @pack_nc_to_ncnc_with_padding(%source: tensor<13x15xf32>, %dest: tensor<2x8x8x2xf32>, %padding: f32) -> tensor<13x15xf32> { - %0 = tensor.pack %source padding_value(%padding : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest : tensor<13x15xf32> -> tensor<2x8x8x2xf32> - %1 = tensor.empty() : tensor<13x15xf32> - %2 = tensor.unpack %0 inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %1 : tensor<2x8x8x2xf32> -> tensor<13x15xf32> - return %2 : tensor<13x15xf32> -} - -// CHECK-LABEL: func.func @pack_nc_to_ncnc_with_padding( -// CHECK-SAME: %[[SOURCE:.*]]: tensor<13x15xf32>, -// CHECK-SAME: %[[DEST:.*]]: tensor<2x8x8x2xf32>, -// CHECK-SAME: %[[PADDING:.*]]: f32) -// CHECK: %[[PACKED:.*]] = tensor.pack %[[SOURCE]] padding_value(%[[PADDING]] : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %[[DEST]] : tensor<13x15xf32> -> tensor<2x8x8x2xf32> -// CHECK: %[[BUFF:.*]] = tensor.empty() : tensor<13x15xf32> -// CHECK: %{{.*}} = tensor.unpack %[[PACKED]] inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %[[BUFF]] : tensor<2x8x8x2xf32> -> tensor<13x15xf32> - -// ----- - -func.func @pack_ck_to_kcck(%source: tensor<128x256xf32>, %dest: tensor<16x4x32x16xf32>) -> tensor<128x256xf32> { - %0 = tensor.pack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<16x4x32x16xf32> - %1 = tensor.empty() : tensor<128x256xf32> - %2 = tensor.unpack %0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %1 : tensor<16x4x32x16xf32> 
-> tensor<128x256xf32> - return %2 : tensor<128x256xf32> -} - -// CHECK-LABEL: func.func @pack_ck_to_kcck( -// CHECK-SAME: %[[SOURCE:.*]]: tensor<128x256xf32>, -// CHECK-SAME: %[[DEST:.*]]: tensor<16x4x32x16xf32>) -// CHECK: %[[PACKED:.*]] = tensor.pack %[[SOURCE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %[[DEST]] : tensor<128x256xf32> -> tensor<16x4x32x16xf32> -// CHECK: %[[BUFF:.*]] = tensor.empty() : tensor<128x256xf32> -// CHECK: %{{.*}} = tensor.unpack %[[PACKED]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %[[BUFF]] : tensor<16x4x32x16xf32> -> tensor<128x256xf32> - -// ----- - -func.func @pad_and_pack_fully_dynamic(%source: tensor, %dest: tensor, %pad: f32, %tile_n : index, %tile_m : index) -> tensor { - %0 = tensor.pack %source padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor - return %0 : tensor -} - -// CHECK-LABEL: func.func @pad_and_pack_fully_dynamic( -// CHECK-SAME: %[[SOURCE:.*]]: tensor, -// CHECK-SAME: %[[DEST:.*]]: tensor, -// CHECK-SAME: %[[PAD:.*]]: f32, -// CHECK-SAME: %[[TILE_N:.*]]: index, -// CHECK-SAME: %[[TILE_M:.*]]: index) -// CHECK: %{{.*}} = tensor.pack %[[SOURCE]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [%[[TILE_N]], %[[TILE_M]]] into %[[DEST]] : tensor -> tensor - -// ----- - -func.func @pad_and_pack_partially_dynamic(%source: tensor, %dest: tensor, %pad: f32) -> tensor { - %0 = tensor.pack %source padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest : tensor -> tensor - return %0 : tensor -} - -// CHECK-LABEL: func.func @pad_and_pack_partially_dynamic( -// CHECK-SAME: %[[SOURCE:.*]]: tensor, -// CHECK-SAME: %[[DEST:.*]]: tensor, -// CHECK-SAME: %[[PAD:.*]]: f32) -// CHECK: %{{.*}} = tensor.pack %[[SOURCE]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %[[DEST]] : tensor -> tensor - -// ----- - -func.func @unpack_fully_dynamic(%source: tensor, %dest: tensor, %tile_n : index, %tile_m : index) -> tensor { - %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor - return %0 : tensor -} - -// CHECK-LABEL: func.func @unpack_fully_dynamic( -// CHECK-SAME: %[[SOURCE:.*]]: tensor, -// CHECK-SAME: %[[DEST:.*]]: tensor, -// CHECK-SAME: %[[TILE_N:.*]]: index, -// CHECK-SAME: %[[TILE_M:.*]]: index) -// CHECK: %{{.*}} = tensor.unpack %[[SOURCE]] inner_dims_pos = [0, 1] inner_tiles = [%[[TILE_N]], %[[TILE_M]]] into %[[DEST]] : tensor -> tensor - -// ----- - -func.func @unpack_partially_dynamic(%source: tensor, %dest: tensor) -> tensor { - %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest : tensor -> tensor - return %0: tensor -} - -// CHECK-LABEL: func.func @unpack_partially_dynamic( -// CHECK-SAME: %[[SOURCE:.*]]: tensor, -// CHECK-SAME: %[[DEST:.*]]: tensor) -// CHECK: %{{.*}} = tensor.unpack %[[SOURCE]] inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %[[DEST]] : tensor -> tensor diff --git a/mlir/test/Dialect/Tensor/tiling.mlir b/mlir/test/Dialect/Tensor/tiling.mlir index 193fbe93e0f9e..04a99b5fd0d68 100644 --- a/mlir/test/Dialect/Tensor/tiling.mlir +++ b/mlir/test/Dialect/Tensor/tiling.mlir @@ -224,495 +224,3 @@ module attributes {transform.with_named_sequence} { transform.yield } } - -// ----- - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 * 32)> -// CHECK: func.func @NC_to_NCnc -// CHECK-SAME: %[[IN:.*]]: tensor<128x256xf32>, -// CHECK-SAME: 
%[[OUT:.*]]: tensor<4x8x32x32xf32>) -> tensor<4x8x32x32xf32> { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[RES0:.*]] = scf.for %[[N:.*]] = %[[C0]] to %[[C4]] step %[[C2]] iter_args(%[[ITER0:.*]] = %[[OUT]]) -> (tensor<4x8x32x32xf32>) { -// CHECK: %[[RES1:.+]] = scf.for %[[C:.*]] = %[[C0]] to %[[C8]] step %[[C4]] iter_args(%[[ITER1:.*]] = %[[ITER0]]) -> (tensor<4x8x32x32xf32>) { -// CHECK-DAG: %[[IN_N:.+]] = affine.apply #[[MAP0]](%[[N]]) -// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP0]](%[[C]]) -// CHECK: %[[SUB_IN:.*]] = tensor.extract_slice %[[IN]][%[[IN_N]], %[[IN_C]]] [64, 128] [1, 1] : tensor<128x256xf32> to tensor<64x128xf32> -// CHECK: %[[SUB_OUT:.*]] = tensor.extract_slice %[[ITER1]][%[[N]], %[[C]], 0, 0] [2, 4, 32, 32] [1, 1, 1, 1] : tensor<4x8x32x32xf32> to tensor<2x4x32x32xf32> -// CHECK: %[[SUB_RES:.*]] = tensor.pack -// CHECK-SAME: %[[SUB_IN]] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[SUB_OUT]] -// CHECK: %[[INSERT:.*]] = tensor.insert_slice %[[SUB_RES]] into %[[ITER1]] -// CHECK: scf.yield %[[INSERT]] : tensor<4x8x32x32xf32> -// CHECK: } -// CHECK: scf.yield %[[RES1:.*]] : tensor<4x8x32x32xf32> -// CHECK: } -// CHECK: return %[[RES0:.*]] : tensor<4x8x32x32xf32> -// CHECK: } -func.func @NC_to_NCnc(%arg0: tensor<128x256xf32>, %arg1: tensor<4x8x32x32xf32>) -> tensor<4x8x32x32xf32> { - %0 = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %arg1 : tensor<128x256xf32> -> tensor<4x8x32x32xf32> - return %0 : tensor<4x8x32x32xf32> -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK: #[[MAP0:.+]] = affine_map<(d0) -> (d0 * 8)> -// CHECK: func.func @KC_to_CKkc -// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: -// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index -// CHECK: scf.for %[[C:.+]] = %[[C0]] to %[[C32]] step %[[C2]] -// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP0]](%[[C]]) -// CHECK: %[[INPUT_SLICE:.+]] = tensor.extract_slice %[[IN]] -// CHECK-SAME: [0, %[[IN_C]]] [128, 16] -// CHECK: %[[OUTPUT_SLICE:.+]] = tensor.extract_slice %{{.+}}[%[[C]], 0, 0, 0] [2, 4, 32, 8] -// CHECK: tensor.pack -// CHECK-SAME: %[[INPUT_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] -// CHECK-SAME: into %[[OUTPUT_SLICE]] -func.func @KC_to_CKkc(%arg0: tensor<128x256xf32>, %arg1: tensor<32x4x32x8xf32>) -> tensor<32x4x32x8xf32> { - %0 = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<128x256xf32> -> tensor<32x4x32x8xf32> - return %0 : tensor<32x4x32x8xf32> -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = 
transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 * 2)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 * -2 + 15, 8)> -// CHECK: func.func @pad_and_pack_static( -// CHECK-SAME: %[[IN:.*]]: tensor<13x15xf32>, -// CHECK-SAME: %[[OUT:.*]]: tensor<2x8x8x2xf32>, -// CHECK-SAME: %[[PAD:.*]]: f32) -> tensor<2x8x8x2xf32> { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[RES0:.*]] = scf.for %[[J:.*]] = %[[C0]] to %[[C8]] step %[[C4]] iter_args(%[[ITER1:.*]] = %[[OUT]]) -> (tensor<2x8x8x2xf32>) { -// CHECK-DAG: %[[IN_J:.*]] = affine.apply #[[MAP0]](%[[J]]) -// CHECK-DAG: %[[IN_J_SZ:.*]] = affine.min #[[MAP1]](%[[J]]) -// CHECK: %[[SUB_IN:.*]] = tensor.extract_slice %[[IN]][0, %[[IN_J]]] [13, %[[IN_J_SZ]]] [1, 1] -// CHECK: %[[SUB_OUT:.*]] = tensor.extract_slice %[[ITER1]][0, %[[J]], 0, 0] [2, 4, 8, 2] [1, 1, 1, 1] -// CHECK: %[[SUB_RES:.*]] = tensor.pack -// CHECK-SAME: %[[SUB_IN]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] -// CHECK-SAME: into %[[SUB_OUT]] -// CHECK: %[[INSERT:.*]] = tensor.insert_slice %[[SUB_RES]] into %[[ITER1]] -// CHECK: scf.yield %[[INSERT]] : tensor<2x8x8x2xf32> -// CHECK: } -// CHECK: return %[[RES0:.*]] : tensor<2x8x8x2xf32> -// CHECK: } -func.func @pad_and_pack_static(%input: tensor<13x15xf32>, %output: tensor<2x8x8x2xf32>, %pad: f32) -> tensor<2x8x8x2xf32> { - %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor<13x15xf32> -> tensor<2x8x8x2xf32> - return %0 : tensor<2x8x8x2xf32> -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)> -// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> (d0 * 8)> -// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -8 + s0, d0 * 8)> -// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0) -> (d0 * 2)> -// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -2 + s0, d0 * 2)> -// CHECK: func.func @pad_and_pack_partially_dynamic( -// CHECK-SAME: %[[IN:.*]]: tensor, -// CHECK-SAME: %[[OUT:.*]]: tensor, -// CHECK-SAME: %[[PAD:.*]]: f32) -> tensor { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[OUT_D0:.*]] = tensor.dim %[[OUT]], %[[C0]] : tensor -// CHECK-DAG: %[[OUT_D1:.*]] = tensor.dim %[[OUT]], %[[C1]] : tensor -// CHECK: %[[RES0:.*]] = scf.for %[[I:.*]] = %[[C0]] to %[[OUT_D0]] step %[[C2]] iter_args(%[[ITER0:.*]] = %[[OUT]]) -> (tensor) { -// CHECK: %[[RES1:.*]] = scf.for %[[J:.*]] = %[[C0]] to %[[OUT_D1]] step %[[C4]] iter_args(%[[ITER1:.*]] = %[[ITER0]]) -> (tensor) { -// CHECK-DAG: %[[OUT_I_SZ:.*]] = affine.min 
#[[MAP0]](%[[I]])[%[[OUT_D0]]] -// CHECK-DAG: %[[OUT_J_SZ:.*]] = affine.min #[[MAP1]](%[[J]])[%[[OUT_D1]]] -// CHECK-DAG: %[[IN_I:.*]] = affine.apply #[[MAP2]](%[[I]]) -// CHECK-DAG: %[[IN_I_SZ:.*]] = affine.min #[[MAP3]] -// CHECK-DAG: %[[IN_J:.*]] = affine.apply #[[MAP4]](%[[J]]) -// CHECK-DAG: %[[IN_J_SZ:.*]] = affine.min #[[MAP5]] -// CHECK: %[[SUB_IN:.*]] = tensor.extract_slice %[[IN]][%[[IN_I]], %[[IN_J]]] [%[[IN_I_SZ]], %[[IN_J_SZ]]] [1, 1] : tensor to tensor -// CHECK: %[[SUB_OUT:.*]] = tensor.extract_slice %[[ITER1]][%[[I]], %[[J]], 0, 0] [%[[OUT_I_SZ]], %[[OUT_J_SZ]], 8, 2] [1, 1, 1, 1] : tensor to tensor -// CHECK: %[[SUB_RES:.*]] = tensor.pack -// CHECK-SAME: %[[SUB_IN]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] -// CHECK-SAME: into %[[SUB_OUT]] -// CHECK: %[[INSERT:.*]] = tensor.insert_slice %[[SUB_RES]] into %[[ITER1]] -// CHECK: scf.yield %[[INSERT]] : tensor -// CHECK: } -// CHECK: scf.yield %[[RES1:.*]] : tensor -// CHECK: } -// CHECK: return %[[VAL_34:.*]] : tensor -// CHECK: } -func.func @pad_and_pack_partially_dynamic(%input: tensor, %output: tensor, %pad: f32) -> tensor { - %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor -> tensor - return %0 : tensor -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)> -// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (d0 * s0)> -// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s0, -(d1 * s0) + s1)> -// CHECK: func.func @pad_and_pack_fully_dynamic( -// CHECK-SAME: %[[IN:.*]]: tensor, -// CHECK-SAME: %[[OUT:.*]]: tensor, -// CHECK-SAME: %[[PAD:.*]]: f32, -// CHECK-SAME: %[[TILE_0:.*]]: index, -// CHECK-SAME: %[[TILE_1:.*]]: index) -> tensor { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[OUT_D0:.*]] = tensor.dim %[[OUT]], %[[C0]] : tensor -// CHECK-DAG: %[[OUT_D1:.*]] = tensor.dim %[[OUT]], %[[C1]] : tensor -// CHECK: %[[RES0:.*]] = scf.for %[[I:.*]] = %[[C0]] to %[[OUT_D0]] step %[[C2]] iter_args(%[[ITER0:.*]] = %[[OUT]]) -> (tensor) { -// CHECK: %[[RES1:.*]] = scf.for %[[J:.*]] = %[[C0]] to %[[OUT_D1]] step %[[C4]] iter_args(%[[ITER1:.*]] = %[[ITER0]]) -> (tensor) { -// CHECK-DAG: %[[OUT_I_SZ:.*]] = affine.min #[[MAP0]](%[[I]])[%[[OUT_D0]]] -// CHECK-DAG: %[[OUT_J_SZ:.*]] = affine.min #[[MAP1]](%[[J]])[%[[OUT_D1]]] -// CHECK-DAG: %[[IN_D0:.*]] = tensor.dim %[[IN]], %[[C0]] -// CHECK-DAG: %[[IN_D1:.*]] = tensor.dim %[[IN]], %[[C1]] -// CHECK: %[[IN_I:.*]] = affine.apply #[[MAP2]](%[[I]])[%[[TILE_0]]] -// CHECK: %[[IN_I_SZ:.*]] = affine.min #[[MAP3]](%[[OUT_I_SZ]], %[[I]])[%[[TILE_0]], %[[IN_D0]]] -// CHECK: %[[IN_J:.*]] = affine.apply #[[MAP2]](%[[J]])[%[[TILE_1]]] -// CHECK: %[[IN_J_SZ:.*]] = affine.min #[[MAP3]](%[[OUT_J_SZ]], %[[J]])[%[[TILE_1]], 
%[[IN_D1]]] -// CHECK: %[[SUB_IN:.*]] = tensor.extract_slice %[[IN]][%[[IN_I]], %[[IN_J]]] [%[[IN_I_SZ]], %[[IN_J_SZ]]] [1, 1] : tensor to tensor -// CHECK: %[[OUT_D2:.+]] = tensor.dim %[[ITER1]], %[[C2]] -// CHECK: %[[OUT_D3:.+]] = tensor.dim %[[ITER1]], %[[C3]] -// CHECK: %[[SUB_OUT:.*]] = tensor.extract_slice %[[ITER1]][%[[I]], %[[J]], 0, 0] [%[[OUT_I_SZ]], %[[OUT_J_SZ]], %[[OUT_D2]], %[[OUT_D3]]] [1, 1, 1, 1] : tensor to tensor -// CHECK: %[[PACK:.*]] = tensor.pack -// CHECK-SAME: %[[SUB_IN]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [%[[TILE_0]], %[[TILE_1]]] -// CHECK-SAME: into %[[SUB_OUT]] -// CHECK: %[[INSERT:.*]] = tensor.insert_slice %[[PACK]] into %[[ITER1]] -// CHECK: scf.yield %[[INSERT]] : tensor -// CHECK: } -// CHECK: scf.yield %[[RES1:.*]] : tensor -// CHECK: } -// CHECK: return %[[RES0:.*]] : tensor -// CHECK: } -func.func @pad_and_pack_fully_dynamic(%source: tensor, %dest: tensor, %pad: f32, %tile_n : index, %tile_m : index) -> tensor { - %0 = tensor.pack %source padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor - return %0 : tensor -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 floordiv 32)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 mod 32)> -// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> ((d0 + 1) floordiv 32 - d0 floordiv 32 + 1)> -// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0) -> (d0 floordiv 16)> -// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0) -> (d0 mod 16)> -// CHECK-DAG: #[[MAP6:.+]] = affine_map<(d0) -> ((d0 + 3) floordiv 16 - d0 floordiv 16 + 1)> -// CHECK: func.func @NCnc_to_NC -// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: -// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index -// CHECK-DAG: %[[C256:.*]] = arith.constant 256 : index -// CHECK: %{{.+}} = scf.for %[[I:.+]] = %[[C0]] to %[[C256]] step %[[C2]] -// CHECK: %{{.+}} = scf.for %[[J:.+]] = %[[C0]] to %[[C128]] step %[[C4]] -// CHECK-DAG: %[[IN_I:.+]] = affine.apply #[[MAP0]](%[[I]]) -// CHECK-DAG: %[[OFFSET_I:.+]] = affine.apply #[[MAP1]](%[[I]]) -// CHECK-DAG: %[[IN_I_SZ:.+]] = affine.apply #[[MAP2]](%[[I]]) -// CHECK-DAG: %[[IN_J:.+]] = affine.apply #[[MAP4]](%[[J]]) -// CHECK-DAG: %[[OFFSET_J:.+]] = affine.apply #[[MAP5]](%[[J]]) -// CHECK-DAG: %[[IN_J_SZ:.+]] = affine.apply #[[MAP6]](%[[J]]) -// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[IN]] -// CHECK-SAME: [%[[IN_I]], %[[IN_J]], 0, 0] [%[[IN_I_SZ]], %[[IN_J_SZ]], 32, 16] -// CHECK-SAME: : tensor<8x8x32x16xf32> to tensor -// CHECK: %[[EMPTY:.+]] = tensor.empty -// CHECK: %[[UNPACK:.+]] = tensor.unpack -// CHECK-SAME: %[[SLICE]] inner_dims_pos = [0, 1] inner_tiles = [32, 16] -// CHECK-SAME: into %[[EMPTY]] -// CHECK: %[[UNPACK_SLICE:.+]] = tensor.extract_slice %[[UNPACK]] -// CHECK-SAME: [%[[OFFSET_I]], %[[OFFSET_J]]] [2, 4] -// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK_SLICE]] -// CHECK-SAME: into 
%{{.+}}[%[[I]], %[[J]]] [2, 4] -// CHECK: scf.yield %[[RES]] -func.func @NCnc_to_NC(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { - %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> - return %0 : tensor<256x128xf32> -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 floordiv 32)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 mod 32)> -// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> ((d0 + 1) floordiv 32 - d0 floordiv 32 + 1)> -// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0) -> (d0 floordiv 8)> -// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0) -> (d0 mod 8)> -// CHECK-DAG: #[[MAP6:.+]] = affine_map<(d0) -> ((d0 + 3) floordiv 8 - d0 floordiv 8 + 1)> -// CHECK: func.func @CKkc_to_KC -// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: -// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index -// CHECK-DAG: %[[C256:.*]] = arith.constant 256 : index -// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[C128]] step %[[C2]] -// CHECK: %{{.+}} = scf.for %[[C:.+]] = %[[C0]] to %[[C256]] step %[[C4]] -// CHECK-DAG: %[[IN_K:.+]] = affine.apply #[[MAP0]](%[[K]]) -// CHECK-DAG: %[[OFFSET_K:.+]] = affine.apply #[[MAP1]](%[[K]]) -// CHECK-DAG: %[[IN_K_SZ:.+]] = affine.apply #[[MAP2]](%[[K]]) -// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP4]](%[[C]]) -// CHECK-DAG: %[[OFFSET_C:.+]] = affine.apply #[[MAP5]](%[[C]]) -// CHECK-DAG: %[[IN_C_SZ:.+]] = affine.apply #[[MAP6]](%[[C]]) -// CHECK: %[[IN_SLICE:.+]] = tensor.extract_slice %[[IN]] -// CHECK: [%[[IN_C]], %[[IN_K]], 0, 0] [%[[IN_C_SZ]], %[[IN_K_SZ]], 32, 8] -// CHECK: %[[EMPTY:.+]] = tensor.empty -// CHECK: %[[UNPACK:.+]] = tensor.unpack -// CHECK-SAME: %[[IN_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] -// CHECK-SAME: into %[[EMPTY]] -// CHECK: %[[UNPACK_SLICE:.+]] = tensor.extract_slice %[[UNPACK]] -// CHECK-SAME: [%[[OFFSET_K]], %[[OFFSET_C]]] [2, 4] -// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK_SLICE]] -// CHECK-SAME: into %{{.+}}[%[[K]], %[[C]]] [2, 4] -// CHECK: scf.yield %[[RES]] -func.func @CKkc_to_KC(%source: tensor<32x4x32x8xf32>, %dest: tensor<128x256xf32>) -> tensor<128x256xf32> { - %0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %dest : tensor<32x4x32x8xf32> -> tensor<128x256xf32> - return %0 : tensor<128x256xf32> -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK-DAG: #[[MAP0:.+]] = 
affine_map<(d0) -> (d0 floordiv 2)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 floordiv 4)> -// CHECK: func.func @perfect_CKkc_to_KC -// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: -// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index -// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[C8]] step %[[C2]] -// CHECK: %{{.+}} = scf.for %[[C:.+]] = %[[C0]] to %[[C128]] step %[[C4]] -// CHECK-DAG: %[[IN_K:.+]] = affine.apply #[[MAP0]](%[[K]]) -// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP1]](%[[C]]) -// CHECK: %[[IN_SLICE:.+]] = tensor.extract_slice %[[IN]] -// CHECK: [%[[IN_C]], %[[IN_K]], 0, 0] [1, 1, 2, 4] -// CHECK: %[[ITER_SLICE:.+]] = tensor.extract_slice %{{.+}}[%[[K]], %[[C]]] [2, 4] -// CHECK: %[[UNPACK:.+]] = tensor.unpack -// CHECK-SAME: %[[IN_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 4] -// CHECK-SAME: into %[[ITER_SLICE]] -// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK]] -// CHECK-SAME: into %{{.+}}[%[[K]], %[[C]]] [2, 4] -// CHECK: scf.yield %[[RES]] -func.func @perfect_CKkc_to_KC(%source: tensor<32x4x2x4xf32>, %dest: tensor<8x128xf32>) -> tensor<8x128xf32> { - %0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 4] into %dest : tensor<32x4x2x4xf32> -> tensor<8x128xf32> - return %0 : tensor<8x128xf32> -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)> -// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 2)> -// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 2)> -// CHECK: func.func @dynamic_perfect_CKkc_to_KC -// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: -// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[DIM_0:.+]] = tensor.dim %[[OUT]], %[[C0]] -// CHECK-DAG: %[[DIM_1:.+]] = tensor.dim %[[OUT]], %[[C1]] -// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[DIM_0]] step %[[C2]] -// CHECK: %{{.+}} = scf.for %[[C:.+]] = %[[C0]] to %[[DIM_1]] step %[[C4]] -// CHECK-DAG: %[[OUT_K_SZ:.+]] = affine.min #[[MAP0]](%[[K]])[%[[DIM_0]]] -// CHECK-DAG: %[[OUT_C_SZ:.+]] = affine.min #[[MAP1]](%[[C]])[%[[DIM_1]]] -// CHECK-DAG: %[[IN_K:.+]] = affine.apply #[[MAP2]](%[[K]]) -// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP2]](%[[C]]) -// CHECK-DAG: %[[IN_C_SZ:.+]] = affine.apply #[[MAP3]](%[[OUT_C_SZ]]) -// CHECK: %[[IN_SLICE:.+]] = tensor.extract_slice %[[IN]] -// CHECK: [%[[IN_C]], %[[IN_K]], 0, 0] [%[[IN_C_SZ]], 1, 2, 2] -// CHECK: %[[ITER_SLICE:.+]] = tensor.extract_slice %{{.+}}[%[[K]], %[[C]]] [%[[OUT_K_SZ]], %[[OUT_C_SZ]]] -// CHECK: %[[UNPACK:.+]] = tensor.unpack -// CHECK-SAME: %[[IN_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] 
inner_tiles = [2, 2] -// CHECK-SAME: into %[[ITER_SLICE]] -// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK]] -// CHECK-SAME: into %{{.+}}[%[[K]], %[[C]]] [%[[OUT_K_SZ]], %[[OUT_C_SZ]]] -// CHECK: scf.yield %[[RES]] - -func.func @dynamic_perfect_CKkc_to_KC(%source: tensor, %dest: tensor) -> tensor { - %0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %dest : tensor -> tensor - return %0 : tensor -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK: #[[MAP:.+]] = affine_map<(d0) -> (d0 floordiv 2)> -// CHECK: func.func @perfect_NKPQk_to_NPQK( -// CHECK-SAME: %[[SOURCE:.+]]: tensor<1x4x6x6x2xf32>, -// CHECK-SAME: %{{.+}}: tensor<1x6x6x8xf32>) -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index -// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK: %{{.+}} = scf.for %[[P:.+]] = %[[C0]] to %[[C6]] step %[[C1]] -// CHECK: %{{.+}} = scf.for %[[Q:.+]] = %[[C0]] to %[[C6]] step %[[C1]] -// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[C8]] step %[[C4]] -// CHECK: %[[K_SZ:.+]] = affine.apply #[[MAP]](%[[K]]) -// CHECK: %[[SLICE_SOURCE:.+]] = tensor.extract_slice %[[SOURCE]][0, %[[K_SZ]], %[[P]], %[[Q]], 0] -// CHECK: %[[SLICE_DEST:.+]] = tensor.extract_slice %{{.+}}[0, %[[P]], %[[Q]], %[[K]]] -// CHECK: %[[UNPACK:.+]] = tensor.unpack -// CHECK-SAME: %[[SLICE_SOURCE]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [2] -// CHECK-SAME: into %[[SLICE_DEST]] -// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK]] -// CHECK-SAME: into %{{.+}}[0, %[[P]], %[[Q]], %[[K]]] -// CHECK: scf.yield %[[RES]] - -func.func @perfect_NKPQk_to_NPQK(%source: tensor<1x4x6x6x2xf32>, %dest: tensor<1x6x6x8xf32>) -> tensor<1x6x6x8xf32> { - %0 = tensor.unpack %source outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [2] into %dest : tensor<1x4x6x6x2xf32> -> tensor<1x6x6x8xf32> - return %0 : tensor<1x6x6x8xf32> -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -func.func private @get_dynamic_tile_size() -> index - -// CHECK-LABEL: func.func @fully_dynamic_unpack -// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] -// CHECK-SAME: %[[DST:[0-9a-zA-Z]+]] -// CHECK: %[[INNER_TS:.+]] = call @get_dynamic_tile_size() : () -> index -// CHECK: %[[TD0:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC0:.*]] = %[[DST]]) -// CHECK: %[[TD1:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC1:.*]] = %[[TC0]]) -// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[SRC]] -// CHECK: %[[EMPTY:.+]] = tensor.empty -// CHECK: 
%[[UNPACK:.+]] = tensor.unpack %[[SLICE]] -// CHECK-SAME: inner_dims_pos = [1, 0] inner_tiles = [%[[INNER_TS]], %[[INNER_TS]]] into %[[EMPTY]] -func.func @fully_dynamic_unpack(%source: tensor, %dest: tensor) -> tensor { - %0 = func.call @get_dynamic_tile_size() : () -> index - %1 = tensor.unpack %source inner_dims_pos = [1, 0] inner_tiles = [%0, %0] into %dest : tensor -> tensor - return %1 : tensor -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [4, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 * 2)> -// CHECK: func.func @perfect_NPQK_to_NKPQk -// CHECK-SAME: %[[SOURCE:.+]]: tensor<1x6x6x8xf32>, -// CHECK-SAME: %{{.+}}: tensor<1x4x6x6x2xf32>) -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index -// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK: %{{.+}} = scf.for %[[ARG2:.+]] = %[[C0]] to %[[C4]] step %[[C1]] -// CHECK: %{{.+}} = scf.for %[[ARG4:.+]] = %[[C0]] to %[[C6]] step %[[C1]] -// CHECK: %{{.+}} = scf.for %[[ARG6:.+]] = %[[C0]] to %[[C6]] step %[[C1]] -// CHECK: %[[APPLY:.+]] = affine.apply #[[MAP1]](%[[ARG2]]) -// CHECK: %[[SLICE_SOURCE:.+]] = tensor.extract_slice %[[SOURCE]][0, %[[ARG4]], %[[ARG6]], %[[APPLY]]] -// CHECK: %[[SLICE_DEST:.+]] = tensor.extract_slice %{{.+}}[0, %[[ARG2]], %[[ARG4]], %[[ARG6]], 0] -// CHECK: %[[PACK:.+]] = tensor.pack -// CHECK-SAME: %[[SLICE_SOURCE]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [2] -// CHECK-SAME: into %[[SLICE_DEST]] -// CHECK: %[[RES:.+]] = tensor.insert_slice %[[PACK]] -// CHECK-SAME: into %{{.+}}[0, %[[ARG2]], %[[ARG4]], %[[ARG6]], 0] -// CHECK: scf.yield %[[RES]] - -func.func @perfect_NPQK_to_NKPQk(%source: tensor<1x6x6x8xf32>, %dest: tensor<1x4x6x6x2xf32>) -> tensor<1x4x6x6x2xf32> { - %0 = tensor.pack %source outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [2] into %dest : tensor<1x6x6x8xf32> -> tensor<1x4x6x6x2xf32> - return %0 : tensor<1x4x6x6x2xf32> -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/pack-scalable-inner-tile.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/pack-scalable-inner-tile.mlir index a0fd3f7d87083..bca94d4a64416 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/pack-scalable-inner-tile.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/pack-scalable-inner-tile.mlir @@ -22,7 +22,7 @@ // RUN: rm -f %t && %{compile} && %{run} | FileCheck %s -/// End-to-end test for tensor.pack where one of the inner tile sizes is +/// End-to-end test for linalg.pack where one of the inner tile sizes is /// scalable. 
func.func @main() { @@ -60,7 +60,7 @@ func.func private @pack(%A: tensor<7x16xi32>) { %A_pack_empty = tensor.empty(%c1, %tile_size) : tensor - %A_pack = tensor.pack %A + %A_pack = linalg.pack %A padding_value(%pad_val : i32) inner_dims_pos = [0, 1] inner_tiles = [%tile_size, 1] @@ -117,9 +117,9 @@ func.func private @pack(%A: tensor<7x16xi32>) { module @transforms attributes { transform.with_named_sequence } { transform.named_sequence @__transform_main(%module: !transform.any_op {transform.consume}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module : (!transform.any_op) -> !transform.any_op + %pack = transform.structured.match ops{["linalg.pack"]} in %module : (!transform.any_op) -> !transform.any_op - // 1. Tile so that we can decompose tensor.pack into tensor.pad and other + // 1. Tile so that we can decompose linalg.pack into tensor.pad and other // Ops (see step 2) %tiled_pack_op_p, %loops:2 = transform.structured.tile_using_for %pack tile_sizes [1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/pack-dynamic-inner-tile.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/pack-dynamic-inner-tile.mlir index 15edae8b6d3f8..a8daa0b855d00 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/pack-dynamic-inner-tile.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/pack-dynamic-inner-tile.mlir @@ -8,7 +8,7 @@ // RUN: rm -f %t && %{compile} && %{run} | FileCheck %s -/// End-to-end test for tensor.pack where one of the inner tile sizes is +/// End-to-end test for linalg.pack where one of the inner tile sizes is /// dynamic. func.func @main() { @@ -38,7 +38,7 @@ func.func private @pack(%A: tensor<7x16xi32>) { %tile_size = arith.constant 8 : index %A_pack_empty = tensor.empty(%c1, %tile_size) : tensor - %A_pack = tensor.pack %A + %A_pack = linalg.pack %A padding_value(%pad_val : i32) inner_dims_pos = [0, 1] inner_tiles = [%tile_size, 1] @@ -78,9 +78,9 @@ func.func private @pack(%A: tensor<7x16xi32>) { module @transforms attributes { transform.with_named_sequence } { transform.named_sequence @__transform_main(%module: !transform.any_op {transform.consume}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module : (!transform.any_op) -> !transform.any_op + %pack = transform.structured.match ops{["linalg.pack"]} in %module : (!transform.any_op) -> !transform.any_op - // 1. Tile so that we can decompose tensor.pack into tensor.pad and other + // 1. Tile so that we can decompose linalg.pack into tensor.pad and other // Ops (see step 2) %tiled_pack_op_p, %loops:2 = transform.structured.tile_using_for %pack tile_sizes [1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir index 63622d761bc5b..05e678227de32 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir @@ -12,9 +12,9 @@ /// End-to-end test for computing matrix-multiplication using linalg.mmt4d. 
In /// particular, demonstrates how the following MLIR sequence (implemented in @mmt4d): /// -/// A_pack = tensor.pack A -/// B_pack = tensor.pack B -/// C_pack = tensor.pack C +/// A_pack = linalg.pack A +/// B_pack = linalg.pack B +/// C_pack = linalg.pack C /// out_pack = linalg.mmt4d(A_pack, B_pack, C_pack) /// /// is equivalent to: @@ -86,16 +86,16 @@ func.func private @mmt4d(%A: tensor<7x16xi32>, %B: tensor<16x13xi32>, %C: tensor %C_pack_empty = tensor.empty() : tensor<2x2x8x8xi32> // Pack matrices - %A_pack = tensor.pack %A padding_value(%zero : i32) inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %A_pack_empty : tensor<7x16xi32> -> tensor<2x16x8x1xi32> - %B_pack = tensor.pack %B padding_value(%zero : i32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [8, 1] into %B_pack_empty : tensor<16x13xi32> -> tensor<2x16x8x1xi32> - %C_pack = tensor.pack %C padding_value(%zero : i32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %C_pack_empty : tensor<7x13xi32> -> tensor<2x2x8x8xi32> + %A_pack = linalg.pack %A padding_value(%zero : i32) inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %A_pack_empty : tensor<7x16xi32> -> tensor<2x16x8x1xi32> + %B_pack = linalg.pack %B padding_value(%zero : i32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [8, 1] into %B_pack_empty : tensor<16x13xi32> -> tensor<2x16x8x1xi32> + %C_pack = linalg.pack %C padding_value(%zero : i32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %C_pack_empty : tensor<7x13xi32> -> tensor<2x2x8x8xi32> // MMT4D %mmt4d = linalg.mmt4d ins(%A_pack, %B_pack : tensor<2x16x8x1xi32>, tensor<2x16x8x1xi32>) outs(%C_pack : tensor<2x2x8x8xi32>) -> tensor<2x2x8x8xi32> // Unpack output %C_out_empty = tensor.empty() : tensor<7x13xi32> - %C_out_unpack = tensor.unpack %mmt4d outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %C_out_empty : tensor<2x2x8x8xi32> -> tensor<7x13xi32> + %C_out_unpack = linalg.unpack %mmt4d outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %C_out_empty : tensor<2x2x8x8xi32> -> tensor<7x13xi32> return %C_out_unpack : tensor<7x13xi32> } @@ -146,16 +146,16 @@ module @transforms attributes { transform.with_named_sequence } { transform.apply_patterns.canonicalization } : !transform.op<"func.func"> - // Step 4. Lower tensor.pack - %pack = transform.structured.match ops{["tensor.pack"]} in %func_h - : (!transform.op<"func.func">) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + // Step 4. Lower linalg.pack + %pack = transform.structured.match ops{["linalg.pack"]} in %func_h + : (!transform.op<"func.func">) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) - // Step 5. Lower tensor.unpack - %unpack = transform.structured.match ops{["tensor.unpack"]} in %func_h - : (!transform.op<"func.func">) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">) + // Step 5. 
Lower linalg.unpack + %unpack = transform.structured.match ops{["linalg.unpack"]} in %func_h + : (!transform.op<"func.func">) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/unpack-dynamic-inner-tile.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/unpack-dynamic-inner-tile.mlir index 4395dfe74914e..c5360ee1ec954 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/unpack-dynamic-inner-tile.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/unpack-dynamic-inner-tile.mlir @@ -8,7 +8,7 @@ // RUN: rm -f %t && %{compile} && %{run} | FileCheck %s -/// End-to-end test for tensor.unpack where one of the inner tile sizes is +/// End-to-end test for linalg.unpack where one of the inner tile sizes is /// dynamic. func.func @main() { @@ -56,7 +56,7 @@ func.func private @unpack(%A: tensor) { %tile_size = arith.constant 8 : index %A_unpack_empty = tensor.empty() : tensor<7x3xi32> - %A_unpack = tensor.unpack %A + %A_unpack = linalg.unpack %A inner_dims_pos = [0, 1] inner_tiles = [%tile_size, 1] into %A_unpack_empty : tensor -> tensor<7x3xi32> @@ -78,9 +78,9 @@ func.func private @unpack(%A: tensor) { module @transforms attributes { transform.with_named_sequence } { transform.named_sequence @__transform_main(%module: !transform.any_op {transform.consume}) { - %pack = transform.structured.match ops{["tensor.unpack"]} in %module : (!transform.any_op) -> !transform.any_op + %pack = transform.structured.match ops{["linalg.unpack"]} in %module : (!transform.any_op) -> !transform.any_op - // 1. Tile so that we can decompose tensor.pack + // 1. Tile so that we can decompose linalg.pack // Ops (see step 2) %c8 = transform.param.constant 8 : i64 -> !transform.param %tiled_pack_op_p, %loops:2 = transform.structured.tile_using_for %pack tile_sizes [%c8, 1] diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir index 2d35be403ef99..8ce05d94c4ad0 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir @@ -211,7 +211,7 @@ module { linalg.yield %7, %8 : f32, f32 } -> (tensor<64x64xf32>, tensor<64x64xf32>) %5 = tensor.empty() : tensor<2048xf32> - %unpack = tensor.unpack %0#0 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %5 : tensor<64x32xf32> -> tensor<2048xf32> + %unpack = linalg.unpack %0#0 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %5 : tensor<64x32xf32> -> tensor<2048xf32> return %4#1, %unpack : tensor<64x64xf32>, tensor<2048xf32> } } @@ -254,7 +254,7 @@ module attributes {transform.with_named_sequence} { // CHECK: tensor.parallel_insert_slice %[[ELEM_OUT]]#1 into %[[ELEM_OUT_ARG_1]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: } // CHECK: } -// CHECK: %[[UNPACK:.*]] = tensor.unpack %[[FINAL_RESULT]]#0 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %{{.*}} : tensor<64x32xf32> -> tensor<2048xf32> +// CHECK: %[[UNPACK:.*]] = linalg.unpack %[[FINAL_RESULT]]#0 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %{{.*}} : tensor<64x32xf32> -> tensor<2048xf32> // CHECK: return %[[FINAL_RESULT]]#3, %[[UNPACK]] : // ----- @@ -278,7 +278,7 @@ module { } } %output = tensor.empty() : tensor<2048xf32> - %unpack = tensor.unpack %1 
outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %output : tensor<64x32xf32> -> tensor<2048xf32> + %unpack = linalg.unpack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %output : tensor<64x32xf32> -> tensor<2048xf32> return %unpack : tensor<2048xf32> } } @@ -308,7 +308,7 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[UNPACK_RESULT_OFFSET:.*]] = affine.apply #[[UNPACK_RESULT_OFFSET_MAP]](%[[IV1]]) // CHECK-DAG: %[[UNPACK_RESULT_SIZE:.*]] = affine.min #[[UNPACK_RESULT_SIZE_MAP]](%[[IV1]]) // CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[UNPACK_OUT_ARG]][%[[UNPACK_RESULT_OFFSET]]] [%[[UNPACK_RESULT_SIZE]]] [1] -// CHECK: %[[TILED_UNPACK_OUT:.*]] = tensor.unpack %[[GENERIC_OUT]] +// CHECK: %[[TILED_UNPACK_OUT:.*]] = linalg.unpack %[[GENERIC_OUT]] // CHECK-SAME: outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] // CHECK-SAME: into %[[TILED_UNPACK_DEST]] // CHECK: scf.forall.in_parallel { @@ -339,7 +339,7 @@ module { } } %output = tensor.empty() : tensor<2047xf32> - %unpack = tensor.unpack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %output : tensor<64x32xf32> -> tensor<2047xf32> + %unpack = linalg.unpack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %output : tensor<64x32xf32> -> tensor<2047xf32> return %unpack : tensor<2047xf32> } } @@ -369,7 +369,7 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[UNPACK_RESULT_OFFSET:.*]] = affine.apply #[[UNPACK_RESULT_OFFSET_MAP]](%[[IV1]]) // CHECK-DAG: %[[UNPACK_RESULT_SIZE:.*]] = affine.min #[[UNPACK_RESULT_SIZE_MAP]](%[[IV1]]) // CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[UNPACK_OUT_ARG]][%[[UNPACK_RESULT_OFFSET]]] [%[[UNPACK_RESULT_SIZE]]] [1] -// CHECK: %[[TILED_UNPACK_OUT:.*]] = tensor.unpack %[[GENERIC_OUT]] +// CHECK: %[[TILED_UNPACK_OUT:.*]] = linalg.unpack %[[GENERIC_OUT]] // CHECK-SAME: outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] // CHECK-SAME: into %[[TILED_UNPACK_DEST]] // CHECK: scf.forall.in_parallel { @@ -400,7 +400,7 @@ module { } } %output = tensor.empty() : tensor<4x32x16xf32> - %pack = tensor.pack %1 inner_dims_pos = [0] inner_tiles = [16] into %output : tensor<64x32xf32> -> tensor<4x32x16xf32> + %pack = linalg.pack %1 inner_dims_pos = [0] inner_tiles = [16] into %output : tensor<64x32xf32> -> tensor<4x32x16xf32> return %pack : tensor<4x32x16xf32> } } @@ -428,7 +428,7 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: outs(%[[GENERIC_OUT_SLICE]] : // CHECK: %[[PACK_RESULT_OFFSET:.*]] = affine.apply #[[PACK_RESULT_MAP]](%[[IV1]]) // CHECK: %[[TILED_PACK_DEST:.*]] = tensor.extract_slice %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]], %[[IV2]], 0] [2, 32, 16] [1, 1, 1] -// CHECK: %[[TILED_PACK_OUT:.*]] = tensor.pack %[[GENERIC_OUT]] +// CHECK: %[[TILED_PACK_OUT:.*]] = linalg.pack %[[GENERIC_OUT]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [16] // CHECK-SAME: into %[[TILED_PACK_DEST]] // CHECK: scf.forall.in_parallel { diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir index 5f7663af773a4..bc27840fdf5e9 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir @@ -591,7 +591,7 @@ module attributes {transform.with_named_sequence} { // ----- func.func @imperfect_unpack_producer_fusion(%source: tensor<1x1x288x8x4xf32>, 
%dest: tensor<1x2x1152xf32>) -> tensor<1x2x1152xf32> { - %0 = tensor.unpack %source + %0 = linalg.unpack %source outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [8, 4] into %dest @@ -625,7 +625,7 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[ARG1:.+]]: tensor<1x2x1152xf32> // CHECK: %[[FOR_RESULT:.+]] = scf.for{{.*}}iter_args(%[[ITER_ARG:.+]] = {{.*}}) // CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]] -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[SLICE]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[SLICE]] // CHECK-DAG: %[[UNPACK_SLICE:.+]] = tensor.extract_slice %[[UNPACK]] // CHECK-DAG: %[[INIT_SLICE:.+]] = tensor.extract_slice %[[ITER_ARG]] // CHECK: %[[GENERIC:.+]] = linalg.generic diff --git a/mlir/test/Transforms/loop-invariant-code-motion.mlir b/mlir/test/Transforms/loop-invariant-code-motion.mlir index 5133c14414c97..c1604e226a334 100644 --- a/mlir/test/Transforms/loop-invariant-code-motion.mlir +++ b/mlir/test/Transforms/loop-invariant-code-motion.mlir @@ -1163,18 +1163,18 @@ func.func @speculate_ceildivsi_range( func.func @speculate_static_pack_and_unpack(%source: tensor<128x256xf32>, %dest: tensor<4x16x32x16xf32>, %lb: index, %ub: index, %step: index) { - // CHECK: tensor.pack + // CHECK: linalg.pack // CHECK-NEXT: scf.for scf.for %i = %lb to %ub step %step { - %packed = tensor.pack %source + %packed = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> } - // CHECK: tensor.unpack + // CHECK: linalg.unpack // CHECK-NEXT: scf.for scf.for %i = %lb to %ub step %step { - %unpacked = tensor.unpack %dest + %unpacked = linalg.unpack %dest inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %source : tensor<4x16x32x16xf32> -> tensor<128x256xf32> } @@ -1188,25 +1188,25 @@ func.func @speculate_dynamic_pack_and_unpack(%source: tensor, %tile_m: index, %tile_n: index, %pad: f32) { // CHECK: scf.for - // CHECK-NEXT: tensor.pack + // CHECK-NEXT: linalg.pack scf.for %i = %lb to %ub step %step { - %packed = tensor.pack %source + %packed = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor } // CHECK: scf.for - // CHECK-NEXT: tensor.unpack + // CHECK-NEXT: linalg.unpack scf.for %i = %lb to %ub step %step { - %unpacked = tensor.unpack %dest + %unpacked = linalg.unpack %dest inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %source : tensor -> tensor } - // CHECK: tensor.pack + // CHECK: linalg.pack // CHECK-NEXT: scf.for scf.for %i = %lb to %ub step %step { - %packed = tensor.pack %source padding_value(%pad : f32) + %packed = linalg.pack %source padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor } diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp index fa2a27dcfa991..046b9a65f3359 100644 --- a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp +++ b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp @@ -74,8 +74,9 @@ struct TestLinalgTransforms *this, "test-decompose-pad-tensor", llvm::cl::desc("Test transform pad tensor by copying with generic ops"), llvm::cl::init(false)}; + // TODO: This is not used - delete. 
Option testDecomposeTensorPackOp{ - *this, "test-decompose-tensor-pack", + *this, "test-decompose-linalg-pack", llvm::cl::desc("Test transform that generalizes pack ops into a sequence " "of tensor and Linalg ops"), llvm::cl::init(false)}; @@ -130,6 +131,14 @@ struct TestLinalgTransforms Option testDecomposeWinogradOps{ *this, "test-decompose-winograd-ops", llvm::cl::desc("Test decompose Winograd ops"), llvm::cl::init(false)}; + Option testFoldIntoPackAndUnpack{ + *this, "test-fold-into-pack-and-unpack", + llvm::cl::desc("Test folding ops into linalg.pack and linalg.unpack"), + llvm::cl::init(false)}; + Option testSimplifyPackUnpackPatterns{ + *this, "test-simplify-pack-unpack-patterns", + llvm::cl::desc("Test patterns to simplify linalg.pack and linalg.unpack"), + llvm::cl::init(false)}; }; } // namespace @@ -227,6 +236,18 @@ static void applyDecomposeWinogradOps(func::FuncOp funcOp) { (void)applyPatternsGreedily(funcOp, std::move(patterns)); } +static void applyFoldIntoPackAndUnpackPatterns(Operation *rootOp) { + RewritePatternSet patterns(rootOp->getContext()); + linalg::populateFoldIntoPackAndUnpackPatterns(patterns); + (void)applyPatternsGreedily(rootOp, std::move(patterns)); +} + +static void applySimplifyPackUnpackPatterns(Operation *rootOp) { + RewritePatternSet patterns(rootOp->getContext()); + linalg::populateSimplifyPackAndUnpackPatterns(patterns); + (void)applyPatternsGreedily(rootOp, std::move(patterns)); +} + /// Apply transformations specified as patterns. void TestLinalgTransforms::runOnOperation() { if (testPatterns) @@ -255,6 +276,11 @@ void TestLinalgTransforms::runOnOperation() { return applyWinogradConv2D(getOperation()); if (testDecomposeWinogradOps) return applyDecomposeWinogradOps(getOperation()); + Operation *rootOp = getOperation(); + if (testFoldIntoPackAndUnpack) + applyFoldIntoPackAndUnpackPatterns(rootOp); + if (testSimplifyPackUnpackPatterns) + applySimplifyPackUnpackPatterns(rootOp); } namespace mlir { diff --git a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp index 173bfd8955f2b..e435130c2a417 100644 --- a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp +++ b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp @@ -77,11 +77,6 @@ struct TestTensorTransforms llvm::cl::desc("Test folding of expand_shape/collapse_shape"), llvm::cl::init(false)}; - Option testFoldIntoPackAndUnpack{ - *this, "test-fold-into-pack-and-unpack", - llvm::cl::desc("Test folding ops into tensor.pack and tensor.unpack"), - llvm::cl::init(false)}; - Option useForeach{ *this, "use-foreach", llvm::cl::desc( @@ -89,11 +84,6 @@ struct TestTensorTransforms "the extract_slice of collapse_shape pattern"), llvm::cl::init(false)}; - Option testSimplifyPackUnpackPatterns{ - *this, "test-simplify-pack-unpack-patterns", - llvm::cl::desc("Test patterns to simplify tensor.pack and tensor.unpack"), - llvm::cl::init(false)}; - Option testTrackingListener{ *this, "test-tracking-listener", llvm::cl::desc("Test tensor TrackingListener for the transform dialect"), @@ -113,12 +103,6 @@ static void applyBubbleUpExpandShapePatterns(Operation *rootOp) { (void)applyPatternsGreedily(rootOp, std::move(patterns)); } -static void applyFoldIntoPackAndUnpackPatterns(Operation *rootOp) { - RewritePatternSet patterns(rootOp->getContext()); - tensor::populateFoldIntoPackAndUnpackPatterns(patterns); - (void)applyPatternsGreedily(rootOp, std::move(patterns)); -} - static void applyFoldConstantExtractSlicePatterns(Operation *rootOp) { RewritePatternSet 
patterns(rootOp->getContext()); tensor::ControlConstantExtractSliceFusionFn controlFn = @@ -148,12 +132,6 @@ applyDropRedundantInsertSliceRankExpansionPatterns(Operation *rootOp) { (void)applyPatternsGreedily(rootOp, std::move(patterns)); } -static void applySimplifyPackUnpackPatterns(Operation *rootOp) { - RewritePatternSet patterns(rootOp->getContext()); - tensor::populateSimplifyPackAndUnpackPatterns(patterns); - (void)applyPatternsGreedily(rootOp, std::move(patterns)); -} - namespace { /// Base pattern to rewrite a `tensor.collapse_shape -> tensor.extract_slice`. /// The `tensor.extract_slice` is replaced by a loop or gather operation that @@ -387,8 +365,6 @@ static LogicalResult testTrackingListenerReplacements(Operation *rootOp) { void TestTensorTransforms::runOnOperation() { Operation *rootOp = getOperation(); - if (testSimplifyPackUnpackPatterns) - applySimplifyPackUnpackPatterns(rootOp); if (testFoldConstantExtractSlice) applyFoldConstantExtractSlicePatterns(rootOp); if (testFoldConsecutiveInsertExtractSlice) @@ -399,8 +375,6 @@ void TestTensorTransforms::runOnOperation() { applyReassociativeReshapeFoldingPatterns(rootOp); if (testBubbleUpExpandShapePatterns) applyBubbleUpExpandShapePatterns(rootOp); - if (testFoldIntoPackAndUnpack) - applyFoldIntoPackAndUnpackPatterns(rootOp); if (testRewriteExtractSliceWithTiledCollapseShape) { if (failed( applyRewriteExtractFromCollapseShapePatterns(rootOp, useForeach))) From 02c44ce6c6b2ec595e863a2cc8eacbe11c579d7c Mon Sep 17 00:00:00 2001 From: Jonas Paulsson Date: Mon, 17 Feb 2025 04:55:09 -0600 Subject: [PATCH 007/127] Reformat reglists in SystemZMCTargetDesc.cpp (NFC) (#127472) --- .../MCTargetDesc/SystemZMCTargetDesc.cpp | 125 ++++++++---------- 1 file changed, 53 insertions(+), 72 deletions(-) diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index f6951c39ce9be..e84368c769e29 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -35,100 +35,81 @@ using namespace llvm; #include "SystemZGenRegisterInfo.inc" const unsigned SystemZMC::GR32Regs[16] = { - SystemZ::R0L, SystemZ::R1L, SystemZ::R2L, SystemZ::R3L, - SystemZ::R4L, SystemZ::R5L, SystemZ::R6L, SystemZ::R7L, - SystemZ::R8L, SystemZ::R9L, SystemZ::R10L, SystemZ::R11L, - SystemZ::R12L, SystemZ::R13L, SystemZ::R14L, SystemZ::R15L -}; + SystemZ::R0L, SystemZ::R1L, SystemZ::R2L, SystemZ::R3L, + SystemZ::R4L, SystemZ::R5L, SystemZ::R6L, SystemZ::R7L, + SystemZ::R8L, SystemZ::R9L, SystemZ::R10L, SystemZ::R11L, + SystemZ::R12L, SystemZ::R13L, SystemZ::R14L, SystemZ::R15L}; const unsigned SystemZMC::GRH32Regs[16] = { - SystemZ::R0H, SystemZ::R1H, SystemZ::R2H, SystemZ::R3H, - SystemZ::R4H, SystemZ::R5H, SystemZ::R6H, SystemZ::R7H, - SystemZ::R8H, SystemZ::R9H, SystemZ::R10H, SystemZ::R11H, - SystemZ::R12H, SystemZ::R13H, SystemZ::R14H, SystemZ::R15H -}; + SystemZ::R0H, SystemZ::R1H, SystemZ::R2H, SystemZ::R3H, + SystemZ::R4H, SystemZ::R5H, SystemZ::R6H, SystemZ::R7H, + SystemZ::R8H, SystemZ::R9H, SystemZ::R10H, SystemZ::R11H, + SystemZ::R12H, SystemZ::R13H, SystemZ::R14H, SystemZ::R15H}; const unsigned SystemZMC::GR64Regs[16] = { - SystemZ::R0D, SystemZ::R1D, SystemZ::R2D, SystemZ::R3D, - SystemZ::R4D, SystemZ::R5D, SystemZ::R6D, SystemZ::R7D, - SystemZ::R8D, SystemZ::R9D, SystemZ::R10D, SystemZ::R11D, - SystemZ::R12D, SystemZ::R13D, SystemZ::R14D, SystemZ::R15D -}; + SystemZ::R0D, SystemZ::R1D, SystemZ::R2D, 
SystemZ::R3D, + SystemZ::R4D, SystemZ::R5D, SystemZ::R6D, SystemZ::R7D, + SystemZ::R8D, SystemZ::R9D, SystemZ::R10D, SystemZ::R11D, + SystemZ::R12D, SystemZ::R13D, SystemZ::R14D, SystemZ::R15D}; const unsigned SystemZMC::GR128Regs[16] = { - SystemZ::R0Q, 0, SystemZ::R2Q, 0, - SystemZ::R4Q, 0, SystemZ::R6Q, 0, - SystemZ::R8Q, 0, SystemZ::R10Q, 0, - SystemZ::R12Q, 0, SystemZ::R14Q, 0 -}; + SystemZ::R0Q, 0, SystemZ::R2Q, 0, SystemZ::R4Q, 0, SystemZ::R6Q, 0, + SystemZ::R8Q, 0, SystemZ::R10Q, 0, SystemZ::R12Q, 0, SystemZ::R14Q, 0}; const unsigned SystemZMC::FP32Regs[16] = { - SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S, - SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S, - SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S, - SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S -}; + SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S, + SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S, + SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S, + SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S}; const unsigned SystemZMC::FP64Regs[16] = { - SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D, - SystemZ::F4D, SystemZ::F5D, SystemZ::F6D, SystemZ::F7D, - SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D, - SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D -}; + SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D, + SystemZ::F4D, SystemZ::F5D, SystemZ::F6D, SystemZ::F7D, + SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D, + SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D}; const unsigned SystemZMC::FP128Regs[16] = { - SystemZ::F0Q, SystemZ::F1Q, 0, 0, - SystemZ::F4Q, SystemZ::F5Q, 0, 0, - SystemZ::F8Q, SystemZ::F9Q, 0, 0, - SystemZ::F12Q, SystemZ::F13Q, 0, 0 -}; + SystemZ::F0Q, SystemZ::F1Q, 0, 0, SystemZ::F4Q, SystemZ::F5Q, 0, 0, + SystemZ::F8Q, SystemZ::F9Q, 0, 0, SystemZ::F12Q, SystemZ::F13Q, 0, 0}; const unsigned SystemZMC::VR32Regs[32] = { - SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S, - SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S, - SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S, - SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S, - SystemZ::F16S, SystemZ::F17S, SystemZ::F18S, SystemZ::F19S, - SystemZ::F20S, SystemZ::F21S, SystemZ::F22S, SystemZ::F23S, - SystemZ::F24S, SystemZ::F25S, SystemZ::F26S, SystemZ::F27S, - SystemZ::F28S, SystemZ::F29S, SystemZ::F30S, SystemZ::F31S -}; + SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S, SystemZ::F4S, + SystemZ::F5S, SystemZ::F6S, SystemZ::F7S, SystemZ::F8S, SystemZ::F9S, + SystemZ::F10S, SystemZ::F11S, SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, + SystemZ::F15S, SystemZ::F16S, SystemZ::F17S, SystemZ::F18S, SystemZ::F19S, + SystemZ::F20S, SystemZ::F21S, SystemZ::F22S, SystemZ::F23S, SystemZ::F24S, + SystemZ::F25S, SystemZ::F26S, SystemZ::F27S, SystemZ::F28S, SystemZ::F29S, + SystemZ::F30S, SystemZ::F31S}; const unsigned SystemZMC::VR64Regs[32] = { - SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D, - SystemZ::F4D, SystemZ::F5D, SystemZ::F6D, SystemZ::F7D, - SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D, - SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D, - SystemZ::F16D, SystemZ::F17D, SystemZ::F18D, SystemZ::F19D, - SystemZ::F20D, SystemZ::F21D, SystemZ::F22D, SystemZ::F23D, - SystemZ::F24D, SystemZ::F25D, SystemZ::F26D, SystemZ::F27D, - SystemZ::F28D, SystemZ::F29D, SystemZ::F30D, SystemZ::F31D -}; + SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D, SystemZ::F4D, + SystemZ::F5D, 
SystemZ::F6D, SystemZ::F7D, SystemZ::F8D, SystemZ::F9D, + SystemZ::F10D, SystemZ::F11D, SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, + SystemZ::F15D, SystemZ::F16D, SystemZ::F17D, SystemZ::F18D, SystemZ::F19D, + SystemZ::F20D, SystemZ::F21D, SystemZ::F22D, SystemZ::F23D, SystemZ::F24D, + SystemZ::F25D, SystemZ::F26D, SystemZ::F27D, SystemZ::F28D, SystemZ::F29D, + SystemZ::F30D, SystemZ::F31D}; const unsigned SystemZMC::VR128Regs[32] = { - SystemZ::V0, SystemZ::V1, SystemZ::V2, SystemZ::V3, - SystemZ::V4, SystemZ::V5, SystemZ::V6, SystemZ::V7, - SystemZ::V8, SystemZ::V9, SystemZ::V10, SystemZ::V11, - SystemZ::V12, SystemZ::V13, SystemZ::V14, SystemZ::V15, - SystemZ::V16, SystemZ::V17, SystemZ::V18, SystemZ::V19, - SystemZ::V20, SystemZ::V21, SystemZ::V22, SystemZ::V23, - SystemZ::V24, SystemZ::V25, SystemZ::V26, SystemZ::V27, - SystemZ::V28, SystemZ::V29, SystemZ::V30, SystemZ::V31 -}; + SystemZ::V0, SystemZ::V1, SystemZ::V2, SystemZ::V3, SystemZ::V4, + SystemZ::V5, SystemZ::V6, SystemZ::V7, SystemZ::V8, SystemZ::V9, + SystemZ::V10, SystemZ::V11, SystemZ::V12, SystemZ::V13, SystemZ::V14, + SystemZ::V15, SystemZ::V16, SystemZ::V17, SystemZ::V18, SystemZ::V19, + SystemZ::V20, SystemZ::V21, SystemZ::V22, SystemZ::V23, SystemZ::V24, + SystemZ::V25, SystemZ::V26, SystemZ::V27, SystemZ::V28, SystemZ::V29, + SystemZ::V30, SystemZ::V31}; const unsigned SystemZMC::AR32Regs[16] = { - SystemZ::A0, SystemZ::A1, SystemZ::A2, SystemZ::A3, - SystemZ::A4, SystemZ::A5, SystemZ::A6, SystemZ::A7, - SystemZ::A8, SystemZ::A9, SystemZ::A10, SystemZ::A11, - SystemZ::A12, SystemZ::A13, SystemZ::A14, SystemZ::A15 -}; + SystemZ::A0, SystemZ::A1, SystemZ::A2, SystemZ::A3, + SystemZ::A4, SystemZ::A5, SystemZ::A6, SystemZ::A7, + SystemZ::A8, SystemZ::A9, SystemZ::A10, SystemZ::A11, + SystemZ::A12, SystemZ::A13, SystemZ::A14, SystemZ::A15}; const unsigned SystemZMC::CR64Regs[16] = { - SystemZ::C0, SystemZ::C1, SystemZ::C2, SystemZ::C3, - SystemZ::C4, SystemZ::C5, SystemZ::C6, SystemZ::C7, - SystemZ::C8, SystemZ::C9, SystemZ::C10, SystemZ::C11, - SystemZ::C12, SystemZ::C13, SystemZ::C14, SystemZ::C15 -}; + SystemZ::C0, SystemZ::C1, SystemZ::C2, SystemZ::C3, + SystemZ::C4, SystemZ::C5, SystemZ::C6, SystemZ::C7, + SystemZ::C8, SystemZ::C9, SystemZ::C10, SystemZ::C11, + SystemZ::C12, SystemZ::C13, SystemZ::C14, SystemZ::C15}; unsigned SystemZMC::getFirstReg(unsigned Reg) { static unsigned Map[SystemZ::NUM_TARGET_REGS]; From 837b89fc0fc6d0ae7f68e835789ee62580314dcc Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Mon, 17 Feb 2025 12:09:27 +0100 Subject: [PATCH 008/127] [MLIR][NVVM] Add `ptxas-cmd-options` to pass flags to the downstream compiler (#127457) This PR adds `cmd-options` to the `gpu-lower-to-nvvm-pipeline` pipeline and the `nvvm-attach-target` pass, allowing users to pass flags to the downstream compiler, *ptxas*. 
Example: ``` mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 ptxas-cmd-options='-v --register-usage-level=8'" ``` --- .../Dialect/GPU/IR/CompilationInterfaces.h | 4 +++ .../mlir/Dialect/GPU/Pipelines/Passes.h | 5 ++++ .../mlir/Dialect/GPU/Transforms/Passes.td | 3 ++ mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 8 ++++++ mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 7 ++++- .../GPU/Pipelines/GPUToNVVMPipeline.cpp | 1 + .../GPU/Transforms/NVVMAttachTarget.cpp | 18 +++++++++++- mlir/lib/Target/LLVM/NVVM/Target.cpp | 28 +++++++++++++++++-- mlir/test/Dialect/GPU/nvvm-attach-target.mlir | 15 ++++++++++ .../GPU/CUDA/command-line-arg.mlir | 21 ++++++++++++++ 10 files changed, 106 insertions(+), 4 deletions(-) create mode 100644 mlir/test/Dialect/GPU/nvvm-attach-target.mlir create mode 100644 mlir/test/Integration/GPU/CUDA/command-line-arg.mlir diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h index c950ef220f692..9a890ae24d8fc 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h +++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h @@ -108,6 +108,10 @@ class TargetOptions { /// Returns the default compilation target: `CompilationTarget::Fatbin`. static CompilationTarget getDefaultCompilationTarget(); + /// Returns a tokenization of the command line options. + static std::pair> + tokenizeCmdOptions(const std::string &cmdOptions); + protected: /// Derived classes must use this constructor to initialize `typeID` to the /// appropiate value: ie. `TargetOptions(TypeID::get())`. diff --git a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h index caa0901bb4943..035235fc7174a 100644 --- a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h @@ -37,6 +37,11 @@ struct GPUToNVVMPipelineOptions *this, "cubin-format", llvm::cl::desc("Compilation format to use to serialize to cubin."), llvm::cl::init("fatbin")}; + PassOptions::Option cmdOptions{ + *this, "ptxas-cmd-options", + llvm::cl::desc( + "Command line options to pass to the downstream compiler."), + llvm::cl::init("")}; PassOptions::Option optLevel{ *this, "opt-level", llvm::cl::desc("Optimization level for NVVM compilation"), diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td index e055164a1c384..faf4c9ddbc7a7 100644 --- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td @@ -143,6 +143,9 @@ def GpuNVVMAttachTarget: Pass<"nvvm-attach-target", ""> { "Enable flush to zero for denormals.">, ListOption<"linkLibs", "l", "std::string", "Extra bitcode libraries paths to link to.">, + Option<"cmdOptions", "ptxas-cmd-options", "std::string", + /*default=*/ [{""}], + "Command line options passed to downstream compiler">, ]; } diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index fe15a524ec3b5..0de5a87e72c3f 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -2862,6 +2862,8 @@ def NVVM_TargettAttr : NVVM_Attr<"NVVMTarget", "target"> { bool hasFlag(StringRef flag) const; bool hasFastMath() const; bool hasFtz() const; + bool hasCmdOptions() const; + std::optional getCmdOptions() const; }]; let extraClassDefinition = [{ bool $cppClass::hasFlag(StringRef flag) const { @@ -2875,6 +2877,12 @@ def NVVM_TargettAttr : 
NVVM_Attr<"NVVMTarget", "target"> { bool $cppClass::hasFtz() const { return hasFlag("ftz"); } + bool $cppClass::hasCmdOptions() const { + return hasFlag("ptxas-cmd-options"); + } + std::optional $cppClass::getCmdOptions() const { + return getFlags().getNamed("ptxas-cmd-options"); + } }]; } diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index d06f10d3137a1..1bdeb3e356f4b 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -2564,7 +2564,7 @@ CompilationTarget TargetOptions::getDefaultCompilationTarget() { } std::pair> -TargetOptions::tokenizeCmdOptions() const { +TargetOptions::tokenizeCmdOptions(const std::string &cmdOptions) { std::pair> options; llvm::StringSaver stringSaver(options.first); StringRef opts = cmdOptions; @@ -2586,6 +2586,11 @@ TargetOptions::tokenizeCmdOptions() const { return options; } +std::pair> +TargetOptions::tokenizeCmdOptions() const { + return tokenizeCmdOptions(cmdOptions); +} + MLIR_DEFINE_EXPLICIT_TYPE_ID(::mlir::gpu::TargetOptions) #include "mlir/Dialect/GPU/IR/GPUOpInterfaces.cpp.inc" diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp index 8dcf6bab127a6..78ff31a75ca4c 100644 --- a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp +++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp @@ -58,6 +58,7 @@ void buildCommonPassPipeline( nvvmTargetOptions.chip = options.cubinChip; nvvmTargetOptions.features = options.cubinFeatures; nvvmTargetOptions.optLevel = options.optLevel; + nvvmTargetOptions.cmdOptions = options.cmdOptions; pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions)); pm.addPass(createLowerAffinePass()); pm.addPass(createArithToLLVMConversionPass()); diff --git a/mlir/lib/Dialect/GPU/Transforms/NVVMAttachTarget.cpp b/mlir/lib/Dialect/GPU/Transforms/NVVMAttachTarget.cpp index dd705cd338312..a6f7464012f3a 100644 --- a/mlir/lib/Dialect/GPU/Transforms/NVVMAttachTarget.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/NVVMAttachTarget.cpp @@ -45,7 +45,7 @@ struct NVVMAttachTarget DictionaryAttr NVVMAttachTarget::getFlags(OpBuilder &builder) const { UnitAttr unitAttr = builder.getUnitAttr(); - SmallVector flags; + SmallVector flags; auto addFlag = [&](StringRef flag) { flags.push_back(builder.getNamedAttr(flag, unitAttr)); }; @@ -53,6 +53,22 @@ DictionaryAttr NVVMAttachTarget::getFlags(OpBuilder &builder) const { addFlag("fast"); if (ftzFlag) addFlag("ftz"); + + // Tokenize and set the optional command line options. + if (!cmdOptions.empty()) { + auto options = gpu::TargetOptions::tokenizeCmdOptions(cmdOptions); + if (!options.second.empty()) { + llvm::SmallVector nvvmOptionAttrs; + for (const char *opt : options.second) { + nvvmOptionAttrs.emplace_back( + mlir::StringAttr::get(builder.getContext(), StringRef(opt))); + } + flags.push_back(builder.getNamedAttr( + "ptxas-cmd-options", + mlir::ArrayAttr::get(builder.getContext(), nvvmOptionAttrs))); + } + } + if (!flags.empty()) return builder.getDictionaryAttr(flags); return nullptr; diff --git a/mlir/lib/Target/LLVM/NVVM/Target.cpp b/mlir/lib/Target/LLVM/NVVM/Target.cpp index e240a7ae4917f..fa8c597da58b1 100644 --- a/mlir/lib/Target/LLVM/NVVM/Target.cpp +++ b/mlir/lib/Target/LLVM/NVVM/Target.cpp @@ -321,6 +321,25 @@ std::optional NVPTXSerializer::findTool(StringRef tool) { return std::nullopt; } +/// Adds optional command-line arguments to existing arguments. 
+template +static void setOptionalCommandlineArguments(NVVMTargetAttr target, + SmallVectorImpl &ptxasArgs) { + if (!target.hasCmdOptions()) + return; + + std::optional cmdOptions = target.getCmdOptions(); + for (Attribute attr : cast(cmdOptions->getValue())) { + if (auto strAttr = dyn_cast(attr)) { + if constexpr (std::is_same_v) { + ptxasArgs.push_back(strAttr.getValue()); + } else if constexpr (std::is_same_v) { + ptxasArgs.push_back(strAttr.getValue().data()); + } + } + } +} + // TODO: clean this method & have a generic tool driver or never emit binaries // with this mechanism and let another stage take care of it. std::optional> @@ -359,8 +378,8 @@ NVPTXSerializer::compileToBinary(const std::string &ptxCode) { return std::nullopt; TmpFile cubinFile; if (createFatbin) { - Twine cubinFilename = ptxFile->first + ".cubin"; - cubinFile = TmpFile(cubinFilename.str(), llvm::FileRemover(cubinFilename)); + std::string cubinFilename = (ptxFile->first + ".cubin").str(); + cubinFile = TmpFile(cubinFilename, llvm::FileRemover(cubinFilename)); } else { cubinFile.first = binaryFile->first; } @@ -412,6 +431,9 @@ NVPTXSerializer::compileToBinary(const std::string &ptxCode) { useFatbin32 = true; } + // Set optional command line arguments + setOptionalCommandlineArguments(getTarget(), ptxasArgs); + // Create the `fatbinary` args. StringRef chip = getTarget().getChip(); // Remove the arch prefix to obtain the compute capability. @@ -562,6 +584,8 @@ NVPTXSerializer::compileToBinaryNVPTX(const std::string &ptxCode) { cmdOpts.second.append( {"-arch", getTarget().getChip().data(), "--opt-level", optLevel.c_str()}); + // Set optional command line arguments + setOptionalCommandlineArguments(getTarget(), cmdOpts.second); // Create the compiler handle. RETURN_ON_NVPTXCOMPILER_ERROR( nvPTXCompilerCreate(&compiler, ptxCode.size(), ptxCode.c_str())); diff --git a/mlir/test/Dialect/GPU/nvvm-attach-target.mlir b/mlir/test/Dialect/GPU/nvvm-attach-target.mlir new file mode 100644 index 0000000000000..35450e0ad6b1b --- /dev/null +++ b/mlir/test/Dialect/GPU/nvvm-attach-target.mlir @@ -0,0 +1,15 @@ +// RUN: mlir-opt %s --nvvm-attach-target="" | FileCheck %s +// RUN: mlir-opt %s --nvvm-attach-target="ptxas-cmd-options=--register-usage-level=8" | FileCheck %s -check-prefix=CHECK-OPTIONS + +module attributes {gpu.container_module} { + // CHECK-LABEL:gpu.module @kernel_module1 + // CHECK: [#nvvm.target] + // CHECK-OPTIONS: [#nvvm.target] + gpu.module @kernel_module1 { + llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr, + %arg2: !llvm.ptr, %arg3: i64, %arg4: i64, + %arg5: i64) attributes {gpu.kernel} { + llvm.return + } + } +} diff --git a/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir b/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir new file mode 100644 index 0000000000000..34dde6e03c80e --- /dev/null +++ b/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir @@ -0,0 +1,21 @@ +// RUN: mlir-opt %s \ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 ptxas-cmd-options='-v --register-usage-level=8'" -debug-only=serialize-to-binary \ +// RUN: 2>&1 | FileCheck %s + +func.func @host_function(%arg0 : f32, %arg1 : memref) { + %cst = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst2 = memref.dim %arg1, %c0 : memref + + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst, %grid_z = %cst) + threads(%tx, %ty, %tz) in (%block_x = %cst2, %block_y = %cst, %block_z = %cst) { + memref.store %arg0, %arg1[%tx] : memref + gpu.terminator + } + + return +} + +// CHECK: ptxas -arch sm_80 +// 
CHECK-SAME: -v +// CHECK-SAME: --register-usage-level=8 From a177be5528337575ee1b7079958d4250b2eb749f Mon Sep 17 00:00:00 2001 From: gdehame <145553531+gdehame@users.noreply.github.com> Date: Mon, 17 Feb 2025 12:20:58 +0100 Subject: [PATCH 009/127] [mlir][Linalg] Bugfix in decompose generic by unfolding permutation (#126737) The pattern was returning success() by default which made the greedy pattern application act as if the IR was modified and even though nothing was changed and thus it can prevent it from converging for no legitimate reason. The patch makes the rewrite pattern return failure() by default and success() if and only if the IR changed. An example of unexpected behavior is by running `mlir-opt input.mlir --linalg-specialize-generic-ops`, we obtain an empty mlir as output with `input.mlir` as follows: ``` #map = affine_map<(d0) -> (d0)> func.func @f(%arg0: tensor<8xi32>, %arg1: tensor<8xi32>) -> tensor<8xi32> { %0 = tensor.empty() : tensor<8xi32> %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %arg1: tensor<8xi32>, tensor<8xi32>) outs(%0: tensor<8xi32>) { ^bb0(%in: i32, %in_0: i32, %out: i32): %2 = arith.addi %in, %in_0: i32 linalg.yield %2: i32 } -> tensor<8xi32> return %1 : tensor<8xi32> } ``` --- ...DecomposeGenericByUnfoldingPermutation.cpp | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp b/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp index 83c4b5bdf1097..ae8cb94661c76 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp @@ -223,21 +223,21 @@ LogicalResult DecomposeProjectedPermutation::matchAndRewrite( newMap[i] = rewriter.getMultiDimIdentityMap(map.getNumDims()); } - if (isChanged) { - SmallVector operands = op->getOperands(); - ValueRange operandsRef(operands); - - auto newOp = rewriter.create( - /*location=*/op.getLoc(), - /*resultTensorTypes=*/op->getResultTypes(), - /*inputs=*/newInitValues, - /*outputs=*/operandsRef.drop_front(op.getNumDpsInputs()), - /*indexingMaps=*/newMap, - /*iteratorTypes=*/op.getIteratorTypesArray()); - - newOp.getRegion().takeBody(op->getRegion(0)); - rewriter.replaceOp(op, newOp->getResults()); - } + if (!isChanged) + return failure(); + + SmallVector operands = op->getOperands(); + ValueRange operandsRef(operands); + + auto newOp = rewriter.create( + /*location=*/op.getLoc(), + /*resultTensorTypes=*/op->getResultTypes(), + /*inputs=*/newInitValues, + /*outputs=*/operandsRef.drop_front(op.getNumDpsInputs()), + /*indexingMaps=*/newMap, + /*iteratorTypes=*/op.getIteratorTypesArray()); + newOp.getRegion().takeBody(op->getRegion(0)); + rewriter.replaceOp(op, newOp->getResults()); return success(); } From 80b08d1bb803a2ee0af7ae5661dc8f2444d97a41 Mon Sep 17 00:00:00 2001 From: Ricardo Jesus Date: Mon, 17 Feb 2025 11:32:47 +0000 Subject: [PATCH 010/127] [TableGen] Add support for per-write cycle tunables (#125870) This patch adds support for describing per-write resource cycle counts for ReadAdvance records via a new optional field called `tunables`. 
This makes it possible to declare ReadAdvance records such as: def : ReadAdvance<Read_C, 1, [Write_A, Write_B], [2]>; The above will effectively declare two entries in the ReadAdvance table for Read_C, one for Write_A with a cycle count of 1+2, and one for Write_B with a cycle count of 1+0 (omitted values are assumed 0). The field `tunables` provides a list of deltas relative to the base `Cycles` count of the ReadAdvance. Since the field is optional and defaults to a list of 0's, this change doesn't affect current targets. --- llvm/include/llvm/Target/TargetSchedule.td | 16 +++++--- llvm/test/TableGen/PerWriteCycleCount.td | 48 ++++++++++++++++++++++ llvm/utils/TableGen/SubtargetEmitter.cpp | 16 +++++--- 3 files changed, 69 insertions(+), 11 deletions(-) create mode 100644 llvm/test/TableGen/PerWriteCycleCount.td diff --git a/llvm/include/llvm/Target/TargetSchedule.td b/llvm/include/llvm/Target/TargetSchedule.td index 2562ed0901303..f55bff16dcecd 100644 --- a/llvm/include/llvm/Target/TargetSchedule.td +++ b/llvm/include/llvm/Target/TargetSchedule.td @@ -321,9 +321,13 @@ class SchedWriteRes<list<ProcResourceKind> resources> : SchedWrite, // Define values common to ReadAdvance and SchedReadAdvance. // // SchedModel ties these resources to a processor. -class ProcReadAdvance<int cycles, list<SchedWrite> writes = []> { +class ProcReadAdvance<int cycles, list<SchedWrite> writes = [], + list<int> tunables = []> { + assert !le(!size(tunables), !size(writes)), + "cannot have more `tunables' than `writes'"; int Cycles = cycles; list<SchedWrite> ValidWrites = writes; + list<int> CycleTunables = tunables; // Allow a processor to mark some scheduling classes as unsupported // for stronger verification. bit Unsupported = false; @@ -340,15 +344,17 @@ class ProcReadAdvance<int cycles, list<SchedWrite> writes = []> { // indicate operands that are always read this number of Cycles later // than a normal register read, allowing the read's parent instruction // to issue earlier relative to the writer. -class ReadAdvance<SchedRead read, int cycles, list<SchedWrite> writes = []> - : ProcReadAdvance<cycles, writes> { +class ReadAdvance<SchedRead read, int cycles, list<SchedWrite> writes = [], + list<int> tunables = []> + : ProcReadAdvance<cycles, writes, tunables> { SchedRead ReadType = read; } // Directly associate a new SchedRead type with a delay and optional // pipeline bypass. For use with InstRW or ItinRW. -class SchedReadAdvance<int cycles, list<SchedWrite> writes = []> : SchedRead, - ProcReadAdvance<cycles, writes>; +class SchedReadAdvance<int cycles, list<SchedWrite> writes = [], + list<int> tunables = []> + : SchedRead, ProcReadAdvance<cycles, writes, tunables>; // Define SchedRead defaults. Reads seldom need special treatment. def ReadDefault : SchedRead; diff --git a/llvm/test/TableGen/PerWriteCycleCount.td b/llvm/test/TableGen/PerWriteCycleCount.td new file mode 100644 index 0000000000000..ac60d8c438834 --- /dev/null +++ b/llvm/test/TableGen/PerWriteCycleCount.td @@ -0,0 +1,48 @@ +// RUN: llvm-tblgen -gen-subtarget -I %p/../../include %s 2>&1 | FileCheck %s +// RUN: not llvm-tblgen -gen-subtarget -I %p/../../include -DERROR1 %s 2>&1 | FileCheck --check-prefix=ERROR1 %s + +// Make sure that ReadAdvance entries with multiple writes are correctly +// handled.
+ +include "llvm/Target/Target.td" + +def MyTarget : Target; + +let OutOperandList = (outs), InOperandList = (ins) in { + def Inst_A : Instruction; + def Inst_B : Instruction; + def Inst_C : Instruction; +} + +let CompleteModel = 0 in { + def SchedModel_A: SchedMachineModel; +} + +def Read_D : SchedRead; +def Read_E : SchedRead; + +// CHECK: extern const llvm::MCReadAdvanceEntry MyTargetReadAdvanceTable[] = { +// CHECK-NEXT: {0, 0, 0}, // Invalid +// CHECK-NEXT: {0, 1, 1}, // #1 +// CHECK-NEXT: {0, 2, 3}, // #2 +// CHECK-NEXT: {0, 3, 2} // #3 +// CHECK-NEXT: }; // MyTargetReadAdvanceTable + +let SchedModel = SchedModel_A in { + def Write_A : SchedWriteRes<[]>; + def Write_B : SchedWriteRes<[]>; + def Write_C : SchedWriteRes<[]>; + + def : InstRW<[Write_A], (instrs Inst_A)>; + def : InstRW<[Write_B], (instrs Inst_B)>; + def : InstRW<[Write_C, Read_D], (instrs Inst_C)>; + + def : ReadAdvance; + +#ifdef ERROR1 +// ERROR1: error: assertion failed: cannot have more `tunables' than `writes' + def : ReadAdvance; +#endif +} + +def ProcessorA: ProcessorModel<"ProcessorA", SchedModel_A, []>; diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index 49362ff5ef655..aec05f1ae7742 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -1308,23 +1308,27 @@ void SubtargetEmitter::genSchedClassTables(const CodeGenProcModel &ProcModel, } ConstRecVec ValidWrites = ReadAdvance->getValueAsListOfDefs("ValidWrites"); - IdxVec WriteIDs; + std::vector CycleTunables = + ReadAdvance->getValueAsListOfInts("CycleTunables"); + std::vector> WriteIDs; + assert(CycleTunables.size() <= ValidWrites.size() && "Bad ReadAdvance"); + CycleTunables.resize(ValidWrites.size(), 0); if (ValidWrites.empty()) - WriteIDs.push_back(0); + WriteIDs.emplace_back(0, 0); else { - for (const Record *VW : ValidWrites) { + for (const auto [VW, CT] : zip_equal(ValidWrites, CycleTunables)) { unsigned WriteID = SchedModels.getSchedRWIdx(VW, /*IsRead=*/false); assert(WriteID != 0 && "Expected a valid SchedRW in the list of ValidWrites"); - WriteIDs.push_back(WriteID); + WriteIDs.emplace_back(WriteID, CT); } } llvm::sort(WriteIDs); - for (unsigned W : WriteIDs) { + for (const auto &[W, T] : WriteIDs) { MCReadAdvanceEntry RAEntry; RAEntry.UseIdx = UseIdx; RAEntry.WriteResourceID = W; - RAEntry.Cycles = ReadAdvance->getValueAsInt("Cycles"); + RAEntry.Cycles = ReadAdvance->getValueAsInt("Cycles") + T; ReadAdvanceEntries.push_back(RAEntry); } } From 43d308dd0d9ef18d35ea6dcc9283fcbc93066820 Mon Sep 17 00:00:00 2001 From: Victor Campos Date: Mon, 17 Feb 2025 11:43:36 +0000 Subject: [PATCH 011/127] [compiler-rt] Add support for big endian for Arm's __negdf2vfp (#127096) In soft floating-point ABI, this function takes the double argument as a pair of registers r0 and r1. The ordering of these two registers follow the endianness rules, therefore the register on which the bit flipping must happen depends on the endianness. 
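As a standalone illustration (a minimal C sketch, not code from compiler-rt; it assumes GCC/Clang's __BYTE_ORDER__ macros and an IEEE-754 double), the 32-bit word that carries the sign bit, and hence the register that must be flipped, can be picked like this:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Negate a double by flipping bit 63, which sits in the most significant
   32-bit word. Under the soft-float AAPCS the double occupies r0/r1 in
   memory order, so that word maps to r1 on little-endian targets and to
   r0 on big-endian targets. */
static double negate_via_words(double d) {
  uint32_t w[2];
  memcpy(w, &d, sizeof d);
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  w[0] ^= 0x80000000u; /* most significant word comes first: the r0 half */
#else
  w[1] ^= 0x80000000u; /* most significant word comes second: the r1 half */
#endif
  memcpy(&d, w, sizeof d);
  return d;
}

int main(void) {
  printf("%f\n", negate_via_words(2.5)); /* prints -2.500000 */
  return 0;
}
```

This mirrors why the assembly change below applies `eor` to r0 in the big-endian branch and to r1 in the little-endian branch.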
--- compiler-rt/lib/builtins/arm/negdf2vfp.S | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/builtins/arm/negdf2vfp.S b/compiler-rt/lib/builtins/arm/negdf2vfp.S index b7cf91877e38c..329c6de757f68 100644 --- a/compiler-rt/lib/builtins/arm/negdf2vfp.S +++ b/compiler-rt/lib/builtins/arm/negdf2vfp.S @@ -20,7 +20,11 @@ DEFINE_COMPILERRT_FUNCTION(__negdf2vfp) #if defined(COMPILER_RT_ARMHF_TARGET) vneg.f64 d0, d0 #else - eor r1, r1, #-2147483648 // flip sign bit on double in r0/r1 pair +#if _YUGA_BIG_ENDIAN + eor r0, r0, #0x80000000 // flip sign bit on double in r0/r1 pair +#else + eor r1, r1, #0x80000000 // flip sign bit on double in r0/r1 pair +#endif #endif bx lr END_COMPILERRT_FUNCTION(__negdf2vfp) From 4c4fc4650fa66361f19f8c8b60768987fe48a90f Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Mon, 17 Feb 2025 12:53:12 +0100 Subject: [PATCH 012/127] [Flang-RT] Build libflang_rt.so (#121782) Under non-Windows platforms, also create a dynamic library version of the runtime. Build of either version of the library can be switched on using FLANG_RT_ENABLE_STATIC=ON respectively FLANG_RT_ENABLE_SHARED=ON. Default is to build only the static library, consistent with previous behaviour. This is because the way the flang driver invokes the linker, most linkers choose the dynamic library by default, if available. Building the dynamic library therefore causes flang-built executables to depend on `libflang_rt.so`, unless explicitly told otherwise. --- flang-rt/CMakeLists.txt | 30 ++ flang-rt/cmake/modules/AddFlangRT.cmake | 346 ++++++++++++------ .../cmake/modules/AddFlangRTOffload.cmake | 18 +- .../ExternalHelloWorld/CMakeLists.txt | 2 +- flang-rt/lib/cuda/CMakeLists.txt | 30 +- flang-rt/lib/runtime/CMakeLists.txt | 9 +- flang-rt/test/CMakeLists.txt | 2 +- flang-rt/test/lit.cfg.py | 2 +- .../unittests/Runtime/CUDA/CMakeLists.txt | 2 +- 9 files changed, 303 insertions(+), 138 deletions(-) diff --git a/flang-rt/CMakeLists.txt b/flang-rt/CMakeLists.txt index df35e24ec28a7..50d1a5cb2a591 100644 --- a/flang-rt/CMakeLists.txt +++ b/flang-rt/CMakeLists.txt @@ -115,6 +115,15 @@ endif () extend_path(FLANG_RT_INSTALL_RESOURCE_LIB_PATH "${FLANG_RT_INSTALL_RESOURCE_PATH}" "${toolchain_lib_subdir}") cmake_path(NORMAL_PATH FLANG_RT_OUTPUT_RESOURCE_DIR) cmake_path(NORMAL_PATH FLANG_RT_INSTALL_RESOURCE_PATH) +# FIXME: For the libflang_rt.so, the toolchain resource lib dir is not a good +# destination because it is not a ld.so default search path. +# The machine where the executable is eventually executed may not be the +# machine where the Flang compiler and its resource dir is installed, so +# setting RPath by the driver is not an solution. It should belong into +# /usr/lib//libflang_rt.so, like e.g. libgcc_s.so. +# But the linker as invoked by the Flang driver also requires +# libflang_rt.so to be found when linking and the resource lib dir is +# the only reliable location. cmake_path(NORMAL_PATH FLANG_RT_OUTPUT_RESOURCE_LIB_DIR) cmake_path(NORMAL_PATH FLANG_RT_INSTALL_RESOURCE_LIB_PATH) @@ -129,6 +138,27 @@ cmake_path(NORMAL_PATH FLANG_RT_INSTALL_RESOURCE_LIB_PATH) option(FLANG_RT_INCLUDE_TESTS "Generate build targets for the flang-rt unit and regression-tests." "${LLVM_INCLUDE_TESTS}") +option(FLANG_RT_ENABLE_STATIC "Build Flang-RT as a static library." ON) +if (WIN32) + # Windows DLL currently not implemented. 
+ set(FLANG_RT_ENABLE_SHARED OFF) +else () + # TODO: Enable by default to increase test coverage, and which version of the + # library should be the user's choice anyway. + # Currently, the Flang driver adds `-L"libdir" -lflang_rt` as linker + # argument, which leaves the choice which library to use to the linker. + # Since most linkers prefer the shared library, this would constitute a + # breaking change unless the driver is changed. + option(FLANG_RT_ENABLE_SHARED "Build Flang-RT as a shared library." OFF) +endif () +if (NOT FLANG_RT_ENABLE_STATIC AND NOT FLANG_RT_ENABLE_SHARED) + message(FATAL_ERROR " + Must build at least one type of library + (FLANG_RT_ENABLE_STATIC=ON, FLANG_RT_ENABLE_SHARED=ON, or both) + ") +endif () + + set(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT "" CACHE STRING "Compile Flang-RT with GPU support (CUDA or OpenMP)") set_property(CACHE FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT PROPERTY STRINGS "" diff --git a/flang-rt/cmake/modules/AddFlangRT.cmake b/flang-rt/cmake/modules/AddFlangRT.cmake index 630aeb3c65005..a43f1c332187a 100644 --- a/flang-rt/cmake/modules/AddFlangRT.cmake +++ b/flang-rt/cmake/modules/AddFlangRT.cmake @@ -16,7 +16,8 @@ # STATIC # Build a static (.a/.lib) library # OBJECT -# Create only object files without static/dynamic library +# Always create an object library. +# Without SHARED/STATIC, build only the object library. # INSTALL_WITH_TOOLCHAIN # Install library into Clang's resource directory so it can be found by the # Flang driver during compilation, including tests @@ -50,17 +51,73 @@ function (add_flangrt_library name) ") endif () - # Forward libtype to add_library - set(extra_args "") - if (ARG_SHARED) - list(APPEND extra_args SHARED) + # Internal names of libraries. If called with just single type option, use + # the default name for it. Name of targets must only depend on function + # arguments to be predictable for callers. + set(name_static "${name}.static") + set(name_shared "${name}.shared") + set(name_object "obj.${name}") + if (ARG_STATIC AND NOT ARG_SHARED) + set(name_static "${name}") + elseif (NOT ARG_STATIC AND ARG_SHARED) + set(name_shared "${name}") + elseif (NOT ARG_STATIC AND NOT ARG_SHARED AND ARG_OBJECT) + set(name_object "${name}") + elseif (NOT ARG_STATIC AND NOT ARG_SHARED AND NOT ARG_OBJECT) + # Only one of them will actually be built. + set(name_static "${name}") + set(name_shared "${name}") + endif () + + # Determine what to build. If not explicitly specified, honor + # BUILD_SHARED_LIBS (e.g. for unittest libraries). If can build static and + # shared, use ENABLE_STATIC/ENABLE_SHARED setting. + if (ARG_STATIC AND ARG_SHARED) + set(build_static ${FLANG_RT_ENABLE_STATIC}) + set(build_shared ${FLANG_RT_ENABLE_SHARED}) + else () + set(build_static ${ARG_STATIC}) + set(build_shared ${ARG_SHARED}) endif () - if (ARG_STATIC) - list(APPEND extra_args STATIC) + if (NOT ARG_STATIC AND NOT ARG_SHARED AND NOT ARG_OBJECT) + if (BUILD_SHARED_LIBS) + set(build_shared ON) + else () + set(build_static ON) + endif () endif () + + # Build an object library if building multiple libraries at once or if + # explicitly requested. 
+ set(build_object OFF) if (ARG_OBJECT) - list(APPEND extra_args OBJECT) + set(build_object ON) + elseif (build_static AND build_shared) + set(build_object ON) endif () + + # srctargets: targets that contain source files + # libtargets: static/shared if they are built + # alltargets: any add_library target added by this function + set(srctargets "") + set(libtargets "") + set(alltargets "") + if (build_static) + list(APPEND srctargets "${name_static}") + list(APPEND libtargets "${name_static}") + list(APPEND alltargets "${name_static}") + endif () + if (build_shared) + list(APPEND srctargets "${name_shared}") + list(APPEND libtargets "${name_shared}") + list(APPEND alltargets "${name_shared}") + endif () + if (build_object) + set(srctargets "${name_object}") + list(APPEND alltargets "${name_object}") + endif () + + set(extra_args "") if (ARG_EXCLUDE_FROM_ALL) list(APPEND extra_args EXCLUDE_FROM_ALL) endif () @@ -68,132 +125,191 @@ function (add_flangrt_library name) # Also add header files to IDEs to list as part of the library. set_source_files_properties(${ARG_ADDITIONAL_HEADERS} PROPERTIES HEADER_FILE_ONLY ON) - add_library(${name} ${extra_args} ${ARG_ADDITIONAL_HEADERS} ${ARG_UNPARSED_ARGUMENTS}) + # Create selected library types. + if (build_object) + add_library("${name_object}" OBJECT ${extra_args} ${ARG_ADDITIONAL_HEADERS} ${ARG_UNPARSED_ARGUMENTS}) + set_target_properties(${name_object} PROPERTIES + POSITION_INDEPENDENT_CODE ON + FOLDER "Flang-RT/Object Libraries" + ) - if (ARG_INSTALL_WITH_TOOLCHAIN) - set_target_properties(${name} PROPERTIES FOLDER "Flang-RT/Toolchain Libraries") - elseif (ARG_OBJECT) - set_target_properties(${name} PROPERTIES FOLDER "Flang-RT/Object Libraries") - else () - set_target_properties(${name} PROPERTIES FOLDER "Flang-RT/Libraries") + # Replace arguments for the libraries we are going to create. + set(ARG_ADDITIONAL_HEADERS "") + set(ARG_UNPARSED_ARGUMENTS "$") + endif () + if (build_static) + add_library("${name_static}" STATIC ${extra_args} ${ARG_ADDITIONAL_HEADERS} ${ARG_UNPARSED_ARGUMENTS}) + endif () + if (build_shared) + add_library("${name_shared}" SHARED ${extra_args} ${ARG_ADDITIONAL_HEADERS} ${ARG_UNPARSED_ARGUMENTS}) endif () - # Minimum required C++ version for Flang-RT, even if CMAKE_CXX_STANDARD is defined to something else. - target_compile_features(${name} PRIVATE cxx_std_17) + if (libtargets) + # Provide a default alias which exists in either setting. + if (BUILD_SHARED_LIBS) + if (build_shared) + set(default_target "${name_shared}") + else () + set(default_target "${name_static}") + endif () + else () + if (build_static) + set(default_target "${name_static}") + else () + set(default_target "${name_shared}") + endif () + endif () + add_library(${name}.default ALIAS "${default_target}") - # Use compiler-specific options to disable exceptions and RTTI. - if (LLVM_COMPILER_IS_GCC_COMPATIBLE) - target_compile_options(${name} PRIVATE - $<$:-fno-exceptions -fno-rtti -fno-unwind-tables -fno-asynchronous-unwind-tables> - ) - elseif (MSVC) - target_compile_options(${name} PRIVATE - $<$:/EHs-c- /GR-> - ) - elseif (CMAKE_CXX_COMPILER_ID MATCHES "XL") - target_compile_options(${name} PRIVATE - $<$:-qnoeh -qnortti> - ) + # Provide a build target that builds any enabled library. + # Not intended for target_link_libraries. Either use the ${name}.static, + # ${name}.shared variants, or ${name}.default to let BUILD_SHARED_LIBS + # decide. 
+ if (NOT TARGET ${name}) + add_custom_target(${name}) + add_dependencies(${name} ${libtargets}) + endif () endif () - # Also for CUDA source when compiling with FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT=CUDA - if (CMAKE_CUDA_COMPILER_ID MATCHES "NVIDIA") - # Assuming gcc as host compiler. - target_compile_options(${name} PRIVATE - $<$:--no-exceptions -Xcompiler -fno-rtti -Xcompiler -fno-unwind-tables -Xcompiler -fno-asynchronous-unwind-tables> - ) - else () - # Assuming a clang-compatible CUDA compiler. - target_compile_options(${name} PRIVATE - $<$:-fno-exceptions -fno-rtti -fno-unwind-tables -fno-asynchronous-unwind-tables> - ) - endif () + foreach (tgtname IN LISTS libtargets) + if (NOT WIN32) + # Use same stem name for .a and .so. Common in UNIX environments. + # Not possible in Windows environments. + set_target_properties(${tgtname} PROPERTIES OUTPUT_NAME "${name}") + endif () + + if (ARG_INSTALL_WITH_TOOLCHAIN) + set_target_properties(${tgtname} PROPERTIES FOLDER "Flang-RT/Toolchain Libraries") + else () + set_target_properties(${tgtname} PROPERTIES FOLDER "Flang-RT/Libraries") + endif () + endforeach () - # Flang-RT's public headers - target_include_directories(${name} PUBLIC "${FLANG_RT_SOURCE_DIR}/include") + # Define how to compile and link the library. + # Some conceptionally only apply to ${srctargets} or ${libtargets}, but we + # apply them to ${alltargets}. In worst case, they are ignored by CMake. + foreach (tgtname IN LISTS alltargets) + # Minimum required C++ version for Flang-RT, even if CMAKE_CXX_STANDARD is defined to something else. + target_compile_features(${tgtname} PRIVATE cxx_std_17) + + # Use compiler-specific options to disable exceptions and RTTI. + if (LLVM_COMPILER_IS_GCC_COMPATIBLE) + target_compile_options(${tgtname} PRIVATE + $<$:-fno-exceptions -fno-rtti -fno-unwind-tables -fno-asynchronous-unwind-tables> + ) + elseif (MSVC) + target_compile_options(${tgtname} PRIVATE + $<$:/EHs-c- /GR-> + ) + elseif (CMAKE_CXX_COMPILER_ID MATCHES "XL") + target_compile_options(${tgtname} PRIVATE + $<$:-qnoeh -qnortti> + ) + endif () - # For ISO_Fortran_binding.h to be found by the runtime itself (Accessed as #include "flang/ISO_Fortran_binding.h") - # User applications can use #include - target_include_directories(${name} PUBLIC "${FLANG_SOURCE_DIR}/include") + # Also for CUDA source when compiling with FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT=CUDA + if (CMAKE_CUDA_COMPILER_ID MATCHES "NVIDIA") + # Assuming gcc as host compiler. + target_compile_options(${tgtname} PRIVATE + $<$:--no-exceptions -Xcompiler -fno-rtti -Xcompiler -fno-unwind-tables -Xcompiler -fno-asynchronous-unwind-tables> + ) + else () + # Assuming a clang-compatible CUDA compiler. + target_compile_options(${tgtname} PRIVATE + $<$:-fno-exceptions -fno-rtti -fno-unwind-tables -fno-asynchronous-unwind-tables> + ) + endif () - # For Flang-RT's configured config.h to be found - target_include_directories(${name} PRIVATE "${FLANG_RT_BINARY_DIR}") + # Flang-RT's public headers + target_include_directories(${tgtname} PUBLIC "${FLANG_RT_SOURCE_DIR}/include") - # Disable libstdc++/libc++ assertions, even in an LLVM_ENABLE_ASSERTIONS - # build, to avoid an unwanted dependency on libstdc++/libc++.so. 
- if (FLANG_RT_SUPPORTS_UNDEFINE_FLAG) - target_compile_options(${name} PUBLIC -U_GLIBCXX_ASSERTIONS) - target_compile_options(${name} PUBLIC -U_LIBCPP_ENABLE_ASSERTIONS) - endif () + # For ISO_Fortran_binding.h to be found by the runtime itself (Accessed as #include "flang/ISO_Fortran_binding.h") + # User applications can use #include + target_include_directories(${tgtname} PUBLIC "${FLANG_SOURCE_DIR}/include") - # When building the flang runtime if LTO is enabled the archive file - # contains LLVM IR rather than object code. Currently flang is not - # LTO aware so cannot link this file to compiled Fortran code. - if (FLANG_RT_HAS_FNO_LTO_FLAG) - target_compile_options(${name} PRIVATE -fno-lto) - endif () + # For Flang-RT's configured config.h to be found + target_include_directories(${tgtname} PRIVATE "${FLANG_RT_BINARY_DIR}") - # Flang/Clang (including clang-cl) -compiled programs targeting the MSVC ABI - # should only depend on msvcrt/ucrt. LLVM still emits libgcc/compiler-rt - # functions in some cases like 128-bit integer math (__udivti3, __modti3, - # __fixsfti, __floattidf, ...) that msvc does not support. We are injecting a - # dependency to Compiler-RT's builtin library where these are implemented. - if (MSVC AND CMAKE_CXX_COMPILER_ID MATCHES "Clang") - if (FLANG_RT_BUILTINS_LIBRARY) - target_compile_options(${name} PRIVATE "$<$:-Xclang>" "$<$:--dependent-lib=${FLANG_RT_BUILTINS_LIBRARY}>") + # Disable libstdc++/libc++ assertions, even in an LLVM_ENABLE_ASSERTIONS + # build, to avoid an unwanted dependency on libstdc++/libc++.so. + if (FLANG_RT_SUPPORTS_UNDEFINE_FLAG) + target_compile_options(${tgtname} PUBLIC -U_GLIBCXX_ASSERTIONS) + target_compile_options(${tgtname} PUBLIC -U_LIBCPP_ENABLE_ASSERTIONS) endif () - endif () - if (MSVC AND CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") - if (FLANG_RT_BUILTINS_LIBRARY) - target_compile_options(${name} PRIVATE "$<$:-Xflang>" "$<$:--dependent-lib=${FLANG_RT_BUILTINS_LIBRARY}>") - else () - message(WARNING "Did not find libclang_rt.builtins.lib. - LLVM may emit builtins that are not implemented in msvcrt/ucrt and - instead falls back to builtins from Compiler-RT. Linking with ${name} - may result in a linker error.") + + # When building the flang runtime if LTO is enabled the archive file + # contains LLVM IR rather than object code. Currently flang is not + # LTO aware so cannot link this file to compiled Fortran code. + if (FLANG_RT_HAS_FNO_LTO_FLAG) + target_compile_options(${tgtname} PRIVATE -fno-lto) endif () - endif () - # Non-GTest unittests depend on LLVMSupport - if (ARG_LINK_TO_LLVM) - if (LLVM_LINK_LLVM_DYLIB) - set(llvm_libs LLVM) - else() - llvm_map_components_to_libnames(llvm_libs Support) - endif() - target_link_libraries(${name} PUBLIC ${llvm_libs}) - target_include_directories(${name} PUBLIC ${LLVM_INCLUDE_DIRS}) - endif () + # Flang/Clang (including clang-cl) -compiled programs targeting the MSVC ABI + # should only depend on msvcrt/ucrt. LLVM still emits libgcc/compiler-rt + # functions in some cases like 128-bit integer math (__udivti3, __modti3, + # __fixsfti, __floattidf, ...) that msvc does not support. We are injecting a + # dependency to Compiler-RT's builtin library where these are implemented. 
+ if (MSVC AND CMAKE_CXX_COMPILER_ID MATCHES "Clang") + if (FLANG_RT_BUILTINS_LIBRARY) + target_compile_options(${tgtname} PRIVATE "$<$:-Xclang>" "$<$:--dependent-lib=${FLANG_RT_BUILTINS_LIBRARY}>") + endif () + endif () + if (MSVC AND CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") + if (FLANG_RT_BUILTINS_LIBRARY) + target_compile_options(${tgtname} PRIVATE "$<$:-Xflang>" "$<$:--dependent-lib=${FLANG_RT_BUILTINS_LIBRARY}>") + else () + message(WARNING "Did not find libclang_rt.builtins.lib. + LLVM may emit builtins that are not implemented in msvcrt/ucrt and + instead falls back to builtins from Compiler-RT. Linking with ${tgtname} + may result in a linker error.") + endif () + endif () - if (ARG_INCLUDE_DIRECTORIES) - target_include_directories(${name} ${ARG_INCLUDE_DIRECTORIES}) - endif () + # Non-GTest unittests depend on LLVMSupport + if (ARG_LINK_TO_LLVM) + if (LLVM_LINK_LLVM_DYLIB) + set(llvm_libs LLVM) + else() + llvm_map_components_to_libnames(llvm_libs Support) + endif() + target_link_libraries(${tgtname} PUBLIC ${llvm_libs}) + target_include_directories(${tgtname} PUBLIC ${LLVM_INCLUDE_DIRS}) + endif () - if (ARG_LINK_LIBRARIES) - target_link_libraries(${name} PUBLIC ${ARG_LINK_LIBRARIES}) - endif () + if (ARG_INCLUDE_DIRECTORIES) + target_include_directories(${tgtname} ${ARG_INCLUDE_DIRECTORIES}) + endif () - # If this is part of the toolchain, put it into the compiler's resource - # directory. Otherwise it is part of testing and is not installed at all. - # TODO: Consider multi-configuration builds (MSVC_IDE, "Ninja Multi-Config") - if (ARG_INSTALL_WITH_TOOLCHAIN) - set_target_properties(${name} - PROPERTIES - ARCHIVE_OUTPUT_DIRECTORY "${FLANG_RT_OUTPUT_RESOURCE_LIB_DIR}" - ) + if (ARG_LINK_LIBRARIES) + target_link_libraries(${tgtname} PUBLIC ${ARG_LINK_LIBRARIES}) + endif () + endforeach () - install(TARGETS ${name} - ARCHIVE DESTINATION "${FLANG_RT_INSTALL_RESOURCE_LIB_PATH}" - ) - endif () + foreach (tgtname IN LISTS libtargets) + # If this is part of the toolchain, put it into the compiler's resource + # directory. Otherwise it is part of testing and is not installed at all. + # TODO: Consider multi-configuration builds (MSVC_IDE, "Ninja Multi-Config") + if (ARG_INSTALL_WITH_TOOLCHAIN) + set_target_properties(${tgtname} + PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY "${FLANG_RT_OUTPUT_RESOURCE_LIB_DIR}" + LIBRARY_OUTPUT_DIRECTORY "${FLANG_RT_OUTPUT_RESOURCE_LIB_DIR}" + ) - if (ARG_TARGET_PROPERTIES) - set_target_properties(${name} PROPERTIES ${ARG_TARGET_PROPERTIES}) - endif () + install(TARGETS ${tgtname} + ARCHIVE DESTINATION "${FLANG_RT_INSTALL_RESOURCE_LIB_PATH}" + LIBRARY DESTINATION "${FLANG_RT_INSTALL_RESOURCE_LIB_PATH}" + ) + endif () - # flang-rt should build all the Flang-RT targets that are built in an - # 'all' build. - if (NOT ARG_EXCLUDE_FROM_ALL) - add_dependencies(flang-rt ${name}) - endif () + if (ARG_TARGET_PROPERTIES) + set_target_properties(${tgtname} PROPERTIES ${ARG_TARGET_PROPERTIES}) + endif () + + # flang-rt should build all the Flang-RT targets that are built in an + # 'all' build. 
+ if (NOT ARG_EXCLUDE_FROM_ALL) + add_dependencies(flang-rt ${tgtname}) + endif () + endforeach () endfunction (add_flangrt_library) diff --git a/flang-rt/cmake/modules/AddFlangRTOffload.cmake b/flang-rt/cmake/modules/AddFlangRTOffload.cmake index 4e4bd60c63545..6dd0d72dc3fd7 100644 --- a/flang-rt/cmake/modules/AddFlangRTOffload.cmake +++ b/flang-rt/cmake/modules/AddFlangRTOffload.cmake @@ -8,9 +8,15 @@ macro(enable_cuda_compilation name files) if (FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA") + if (FLANG_RT_ENABLE_SHARED) + message(FATAL_ERROR + "FLANG_RT_ENABLE_SHARED is not supported for CUDA offload build of Flang-RT" + ) + endif() + enable_language(CUDA) - set_target_properties(${name} + set_target_properties(${name}.static PROPERTIES CUDA_SEPARABLE_COMPILATION ON ) @@ -54,7 +60,7 @@ macro(enable_cuda_compilation name files) # When using libcudacxx headers files, we have to use them # for all files of Flang-RT. if (EXISTS "${FLANG_RT_LIBCUDACXX_PATH}/include") - foreach (tgt IN ITEMS "${name}" "obj.${name}PTX") + foreach (tgt IN ITEMS "${name}.static" "obj.${name}PTX") target_include_directories(${tgt} AFTER PRIVATE "${FLANG_RT_LIBCUDACXX_PATH}/include") target_compile_definitions(${tgt} PRIVATE RT_USE_LIBCUDACXX=1) endforeach () @@ -66,6 +72,12 @@ macro(enable_omp_offload_compilation name files) if (FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "OpenMP") # OpenMP offload build only works with Clang compiler currently. + if (FLANG_RT_ENABLE_SHARED) + message(FATAL_ERROR + "FLANG_RT_ENABLE_SHARED is not supported for OpenMP offload build of Flang-RT" + ) + endif() + if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" AND "${CMAKE_C_COMPILER_ID}" MATCHES "Clang") @@ -84,7 +96,7 @@ macro(enable_omp_offload_compilation name files) set_source_files_properties(${files} PROPERTIES COMPILE_OPTIONS "${OMP_COMPILE_OPTIONS}" ) - target_link_options(${name} PUBLIC ${OMP_COMPILE_OPTIONS}) + target_link_options(${name}.static PUBLIC ${OMP_COMPILE_OPTIONS}) # Enable "declare target" in the source code. set_source_files_properties(${files} diff --git a/flang-rt/examples/ExternalHelloWorld/CMakeLists.txt b/flang-rt/examples/ExternalHelloWorld/CMakeLists.txt index 4fd04f8f2769a..ccc39242745d9 100644 --- a/flang-rt/examples/ExternalHelloWorld/CMakeLists.txt +++ b/flang-rt/examples/ExternalHelloWorld/CMakeLists.txt @@ -13,5 +13,5 @@ add_llvm_example(external-hello-world target_link_libraries(external-hello-world PRIVATE - flang_rt.runtime + flang_rt.runtime.default ) diff --git a/flang-rt/lib/cuda/CMakeLists.txt b/flang-rt/lib/cuda/CMakeLists.txt index d5ca354c1029f..fc9a95bc49dc5 100644 --- a/flang-rt/lib/cuda/CMakeLists.txt +++ b/flang-rt/lib/cuda/CMakeLists.txt @@ -6,8 +6,7 @@ # #===------------------------------------------------------------------------===# - -add_flangrt_library(flang_rt.cuda STATIC +add_flangrt_library(flang_rt.cuda STATIC SHARED allocatable.cpp allocator.cpp descriptor.cpp @@ -17,18 +16,27 @@ add_flangrt_library(flang_rt.cuda STATIC memory.cpp registration.cpp - # libflang_rt.runtime depends on a certain version of CUDA. To be able to have - # multiple build of this library with different CUDA version, the version is - # added to the library name. TARGET_PROPERTIES + # libflang_rt.runtime depends on a certain version of CUDA. To be able to have + # multiple build of this library with different CUDA version, the version is + # added to the library name. 
OUTPUT_NAME "flang_rt.cuda_${CUDAToolkit_VERSION_MAJOR}" - INCLUDE_DIRECTORIES PRIVATE ${CUDAToolkit_INCLUDE_DIRS} ) -target_link_libraries(flang_rt.cuda - PUBLIC - flang_rt.runtime - CUDA::cudart_static -) +# For the static library, link-in the static dependencies as well. +if (TARGET flang_rt.cuda.static) + target_link_libraries(flang_rt.cuda.static PUBLIC + flang_rt.runtime.static + CUDA::cudart_static + ) +endif () + +# For the shared library, use the shared versions of the dependencies. +if (TARGET flang_rt.cuda.shared) + target_link_libraries(flang_rt.cuda.shared PUBLIC + flang_rt.runtime.shared + CUDA::cudart + ) +endif () diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt index 0afcbf2783533..589ee140485ec 100644 --- a/flang-rt/lib/runtime/CMakeLists.txt +++ b/flang-rt/lib/runtime/CMakeLists.txt @@ -128,7 +128,7 @@ set(sources ${supported_sources} ${host_sources} ${f128_sources}) if (NOT WIN32) - add_flangrt_library(flang_rt.runtime STATIC + add_flangrt_library(flang_rt.runtime STATIC SHARED ${sources} LINK_LIBRARIES ${Backtrace_LIBRARY} INSTALL_WITH_TOOLCHAIN @@ -138,10 +138,9 @@ if (NOT WIN32) enable_cuda_compilation(flang_rt.runtime "${supported_sources}") enable_omp_offload_compilation(flang_rt.runtime "${supported_sources}") - # For unittests that depend on flang_rt. Should link to the static version - # of the library. - add_library(flang_rt.runtime.static ALIAS flang_rt.runtime) - add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime) + # Select a default runtime, which is used for unit and regression tests. + get_target_property(default_target flang_rt.runtime.default ALIASED_TARGET) + add_library(flang_rt.runtime.unittest ALIAS "${default_target}") else() # Target for building all versions of the runtime add_custom_target(flang_rt.runtime) diff --git a/flang-rt/test/CMakeLists.txt b/flang-rt/test/CMakeLists.txt index f5f7b8832d381..cb48d22d3accc 100644 --- a/flang-rt/test/CMakeLists.txt +++ b/flang-rt/test/CMakeLists.txt @@ -44,8 +44,8 @@ add_custom_target(flang-rt-test-depends) set_target_properties(flang-rt-test-depends PROPERTIES FOLDER "Flang-RT/Meta") add_dependencies(flang-rt-test-depends FlangRTUnitTests - flang_rt.runtime flang_rt.runtime.unittest + flang_rt.runtime ) add_lit_testsuite(check-flang-rt "Running the Flang-RT regression tests" diff --git a/flang-rt/test/lit.cfg.py b/flang-rt/test/lit.cfg.py index 652da31e6438f..032aeef2d5bf6 100644 --- a/flang-rt/test/lit.cfg.py +++ b/flang-rt/test/lit.cfg.py @@ -92,7 +92,7 @@ def shjoin(args, sep=" "): ("%include", os.path.join(config.flang_source_dir, "include")) ) -# Library path of libflang_rt.runtime.a (for lib search path when using non-Flang driver for linking) +# Library path of libflang_rt.runtime.a/.so (for lib search path when using non-Flang driver for linking and LD_LIBRARY_PATH) config.substitutions.append(("%libdir", config.flang_rt_output_resource_lib_dir)) # For CUDA offloading, additional steps (device linking) and libraries (cudart) are needed. 
diff --git a/flang-rt/unittests/Runtime/CUDA/CMakeLists.txt b/flang-rt/unittests/Runtime/CUDA/CMakeLists.txt index cd69a6f472873..2faacfda92a84 100644 --- a/flang-rt/unittests/Runtime/CUDA/CMakeLists.txt +++ b/flang-rt/unittests/Runtime/CUDA/CMakeLists.txt @@ -14,5 +14,5 @@ add_flangrt_unittest(FlangCufRuntimeTests target_link_libraries(FlangCufRuntimeTests PRIVATE - flang_rt.cuda + flang_rt.cuda.default ) From 9c9157b25662cedd63426f02cdbde7853454b38e Mon Sep 17 00:00:00 2001 From: Dinu Blanovschi Date: Mon, 17 Feb 2025 13:11:36 +0100 Subject: [PATCH 013/127] Fix typo in LangImpl03.rst (#127389) --- llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst index f41c5ed0ad0cd..001a314cb1331 100644 --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst @@ -313,7 +313,7 @@ in "``TheModule``"s symbol table. Finally, we set the name of each of the function's arguments according to the names given in the Prototype. This step isn't strictly necessary, but keeping the names consistent makes the IR more readable, and allows subsequent code to -refer directly to the arguments for their names, rather than having to look up +refer directly to the arguments for their names, rather than having to look them up in the Prototype AST. At this point we have a function prototype with no body. This is how LLVM IR From 8eba128b2dac8e405b663ef602f85469c3d6edb8 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 17 Feb 2025 12:30:07 +0000 Subject: [PATCH 014/127] ConstRange: exhaustively test makeExactICmpRegion (#127058) Exhaustively test makeExactICmpRegion by comparing makeAllowedICmpRegion against makeSatisfyingICmpRegion for all APInts. --- llvm/lib/IR/ConstantRange.cpp | 7 +++---- llvm/unittests/IR/ConstantRangeTest.cpp | 11 +++++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index 41e40cdf365d2..e09c139db39c8 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -170,11 +170,10 @@ ConstantRange ConstantRange::makeExactICmpRegion(CmpInst::Predicate Pred, const APInt &C) { // Computes the exact range that is equal to both the constant ranges returned // by makeAllowedICmpRegion and makeSatisfyingICmpRegion. This is always true - // when RHS is a singleton such as an APInt and so the assert is valid. - // However for non-singleton RHS, for example ult [2,5) makeAllowedICmpRegion - // returns [0,4) but makeSatisfyICmpRegion returns [0,2). + // when RHS is a singleton such as an APInt. However for non-singleton RHS, + // for example ult [2,5) makeAllowedICmpRegion returns [0,4) but + // makeSatisfyICmpRegion returns [0,2). 
// - assert(makeAllowedICmpRegion(Pred, C) == makeSatisfyingICmpRegion(Pred, C)); return makeAllowedICmpRegion(Pred, C); } diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp index 1bafb52d357fa..bcb5d498c8cb9 100644 --- a/llvm/unittests/IR/ConstantRangeTest.cpp +++ b/llvm/unittests/IR/ConstantRangeTest.cpp @@ -1662,6 +1662,17 @@ TEST(ConstantRange, MakeAllowedICmpRegionEdgeCases) { .isFullSet()); } +TEST(ConstantRange, MakeExactICmpRegion) { + for (unsigned Bits : {1, 4}) { + EnumerateAPInts(Bits, [](const APInt &N) { + for (auto Pred : ICmpInst::predicates()) { + EXPECT_EQ(ConstantRange::makeAllowedICmpRegion(Pred, N), + ConstantRange::makeSatisfyingICmpRegion(Pred, N)); + }; + }); + } +} + TEST(ConstantRange, MakeSatisfyingICmpRegion) { ConstantRange LowHalf(APInt(8, 0), APInt(8, 128)); ConstantRange HighHalf(APInt(8, 128), APInt(8, 0)); From 81a8b2004508a47f733bd8d1c063f7333577cf59 Mon Sep 17 00:00:00 2001 From: Abhina Sree Date: Mon, 17 Feb 2025 07:57:09 -0500 Subject: [PATCH 015/127] [SystemZ][z/OS] Define _XOPEN_SOURCE=600 for dlopen (#127254) On z/OS, dlopen is guarded by _XOPEN_SOURCE=600 so define it when checking for the symbol. --- llvm/cmake/config-ix.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index c128fd2ed125c..15ae04f5a6913 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -390,7 +390,14 @@ if (NOT PURE_WINDOWS) if( HAVE_LIBDL ) list(APPEND CMAKE_REQUIRED_LIBRARIES dl) endif() + # Add the _XOPEN_SOURCE macro on z/OS, as certain test(s) use dlopen + if (ZOS) + list(APPEND CMAKE_REQUIRED_DEFINITIONS "-D_XOPEN_SOURCE=600") + endif() check_symbol_exists(dlopen dlfcn.h HAVE_DLOPEN) + if (ZOS) + list(REMOVE_ITEM CMAKE_REQUIRED_DEFINITIONS "-D_XOPEN_SOURCE=600") + endif() if( HAVE_LIBDL ) list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES dl) endif() From f4206f92c5f900a4e0fc0f6dcab6afb6865df1e9 Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Mon, 17 Feb 2025 14:39:18 +0100 Subject: [PATCH 016/127] [libunwind] Silence -Wunused-parameter warnings in Unwind-wasm.c (#125412) --- libunwind/src/Unwind-wasm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/libunwind/src/Unwind-wasm.c b/libunwind/src/Unwind-wasm.c index b18b32c5d1784..b8b7bc2779f17 100644 --- a/libunwind/src/Unwind-wasm.c +++ b/libunwind/src/Unwind-wasm.c @@ -102,8 +102,7 @@ _LIBUNWIND_EXPORT uintptr_t _Unwind_GetIP(struct _Unwind_Context *context) { } /// Not used in Wasm. -_LIBUNWIND_EXPORT void _Unwind_SetIP(struct _Unwind_Context *context, - uintptr_t value) {} +_LIBUNWIND_EXPORT void _Unwind_SetIP(struct _Unwind_Context *, uintptr_t) {} /// Called by personality handler to get LSDA for current frame. _LIBUNWIND_EXPORT uintptr_t @@ -115,8 +114,7 @@ _Unwind_GetLanguageSpecificData(struct _Unwind_Context *context) { } /// Not used in Wasm. -_LIBUNWIND_EXPORT uintptr_t -_Unwind_GetRegionStart(struct _Unwind_Context *context) { +_LIBUNWIND_EXPORT uintptr_t _Unwind_GetRegionStart(struct _Unwind_Context *) { return 0; } From 9c49b188b8e1434eb774ee8422124ad3e8870dce Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Mon, 17 Feb 2025 14:40:31 +0100 Subject: [PATCH 017/127] [clang] Fix false positive regression for lifetime analysis warning. (#127460) This fixes a false positive caused by #114044. For `GSLPointer*` types, it's less clear whether the lifetime issue is about the GSLPointer object itself or the owner it points to. 
To avoid false positives, we take a conservative approach in our heuristic. Fixes #127195 (This will be backported to release 20). --- clang/lib/Sema/CheckExprLifetime.cpp | 5 ++-- clang/test/Sema/Inputs/lifetime-analysis.h | 2 ++ .../Sema/warn-lifetime-analysis-nocfg.cpp | 24 +++++++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index 8963cad86dbca..1f87001f35b57 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -1239,11 +1239,12 @@ static AnalysisResult analyzePathForGSLPointer(const IndirectLocalPath &Path, } // Check the return type, e.g. // const GSLOwner& func(const Foo& foo [[clang::lifetimebound]]) + // GSLOwner* func(cosnt Foo& foo [[clang::lifetimebound]]) // GSLPointer func(const Foo& foo [[clang::lifetimebound]]) if (FD && - ((FD->getReturnType()->isReferenceType() && + ((FD->getReturnType()->isPointerOrReferenceType() && isRecordWithAttr(FD->getReturnType()->getPointeeType())) || - isPointerLikeType(FD->getReturnType()))) + isGLSPointerType(FD->getReturnType()))) return Report; return Abandon; diff --git a/clang/test/Sema/Inputs/lifetime-analysis.h b/clang/test/Sema/Inputs/lifetime-analysis.h index d318033ff0cc4..2072e4603cead 100644 --- a/clang/test/Sema/Inputs/lifetime-analysis.h +++ b/clang/test/Sema/Inputs/lifetime-analysis.h @@ -61,6 +61,7 @@ struct basic_string_view { basic_string_view(); basic_string_view(const T *); const T *begin() const; + const T *data() const; }; using string_view = basic_string_view; @@ -80,6 +81,7 @@ struct basic_string { const T *c_str() const; operator basic_string_view () const; using const_iterator = iter; + const T *data() const; }; using string = basic_string; diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp index 04bb1330ded4c..66a2a19ceb321 100644 --- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp +++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp @@ -852,3 +852,27 @@ struct Test { }; } // namespace GH120543 + +namespace GH127195 { +template +struct StatusOr { + T* operator->() [[clang::lifetimebound]]; + T* value() [[clang::lifetimebound]]; +}; + +const char* foo() { + StatusOr s; + return s->data(); // expected-warning {{address of stack memory associated with local variable}} + + StatusOr s2; + return s2->data(); + + StatusOr> s3; + return s3.value()->value()->data(); + + // FIXME: nested cases are not supported now. + StatusOr> s4; + return s4.value()->value()->data(); +} + +} // namespace GH127195 From 949e4041c9927a68a39bf42c71bc73728919505f Mon Sep 17 00:00:00 2001 From: Un1q32 Date: Mon, 17 Feb 2025 08:55:14 -0500 Subject: [PATCH 018/127] [libc++] Add watchOS and tvOS checks for aligned_alloc (#126862) Adds the equivalent watchOS and tvOS version checks to check for support for aligned_alloc, we already have macOS and iOS checks. 
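For illustration only (not part of the patch): a hedged C sketch of how a deployment-target guard like this keeps aligned_alloc out of builds whose minimum OS version predates its availability. Only the newly added watchOS/tvOS clauses are shown (the existing macOS/iOS clauses are omitted for brevity), and the HAS_C11_ALIGNED_ALLOC / allocate_aligned names are hypothetical.

```c
#include <stdlib.h>

/* Mirrors the guard added by the patch: watchOS < 6.0 (60000) and
   tvOS < 13.0 (130000) deployment targets cannot assume aligned_alloc. */
#if (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) &&                \
     __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 60000) ||                \
    (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) &&                   \
     __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 130000)
#define HAS_C11_ALIGNED_ALLOC 0
#else
#define HAS_C11_ALIGNED_ALLOC 1
#endif

void *allocate_aligned(size_t alignment, size_t size) {
#if HAS_C11_ALIGNED_ALLOC
  return aligned_alloc(alignment, size); /* C11 allocator is deployable */
#else
  void *p = NULL;                        /* portable POSIX fallback */
  return posix_memalign(&p, alignment, size) == 0 ? p : NULL;
#endif
}
```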
--- libcxx/include/__config | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libcxx/include/__config b/libcxx/include/__config index c16552228dbb1..53900e40655ef 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -663,7 +663,10 @@ typedef __char32_t char32_t; # if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && \ __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101500) || \ (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && \ - __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 130000) + __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 130000) || \ + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && \ + __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 60000) || \ + (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 130000) # define _LIBCPP_HAS_C11_ALIGNED_ALLOC 0 # else # define _LIBCPP_HAS_C11_ALIGNED_ALLOC 1 From d25becaa2079b19e475902ca712cad5df3e660ee Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Mon, 17 Feb 2025 14:53:07 +0100 Subject: [PATCH 019/127] [MLIR][Doc] Update the pass infra doc to advise against `let constructor = ` (NFC) We should avoid specifying it manually and instead rely on TableGen, see also cleanups in #127403 --- mlir/docs/PassManagement.md | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/mlir/docs/PassManagement.md b/mlir/docs/PassManagement.md index 9fb0aaab06461..eda48a44cf023 100644 --- a/mlir/docs/PassManagement.md +++ b/mlir/docs/PassManagement.md @@ -809,11 +809,6 @@ def MyPass : Pass<"my-pass", "ModuleOp"> { its various constraints and behavior. }]; - // A constructor must be provided to specify how to create a default instance - // of MyPass. It can be skipped for this specific example, because both the - // constructor and the registration methods live in the same namespace. - let constructor = "foo::createMyPass()"; - // Specify any options. let options = [ Option<"option", "example-option", "bool", /*default=*/"true", @@ -883,8 +878,7 @@ struct MyPassOptions { #endif // GEN_PASS_DECL_MYPASS ``` -If the `constructor` field has not been specified in the tablegen declaration, -then autogenerated file will also contain the declarations of the default +The utogenerated file will also contain the declarations of the default constructors. ```c++ @@ -927,9 +921,8 @@ struct MyPass : foo::impl::MyPassBase { These definitions can be enabled on a per-pass basis by defining the appropriate preprocessor `GEN_PASS_DEF_PASSNAME` macro, with `PASSNAME` equal to the uppercase version of the name of the pass definition in tablegen. -If the `constructor` field has not been specified in tablegen, then the default -constructors are also defined and expect the name of the actual pass class to -be equal to the name defined in tablegen. +The default constructors are also defined and expect the name of the actual pass +class to be equal to the name defined in tablegen. Using the `gen-pass-doc` generator, markdown documentation for each of the passes can be generated. See [Passes.md](Passes.md) for example output of real @@ -951,12 +944,14 @@ contains the following fields: * `dependentDialects` - A list of strings representing the `Dialect` classes this pass may introduce entities, Attributes/Operations/Types/etc., of. -* `constructor` - - A code block used to create a default instance of the pass. * `options` - A list of pass options used by the pass. * `statistics` - A list of pass statistics used by the pass. 
+* `constructor` + - A code block used to create a default instance of the pass. + Specifying it will disable the constructors auto-generation for the + pass. This is a legacy option, it is not advised to use it. #### Options From 18ea6c928088cf9ad2a990bfcca546c608825a7f Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 17 Feb 2025 21:03:50 +0700 Subject: [PATCH 020/127] AMDGPU: Stop emitting an error on illegal addrspacecasts (#127487) These cannot be static compile errors, and should be treated as poison. Invalid casts may be introduced which are dynamically dead. For example: ``` void foo(volatile generic int* x) { __builtin_assume(is_shared(x)); *x = 4; } void bar() { private int y; foo(&y); // violation, wrong address space } ``` This could produce a compile time backend error or not depending on the optimization level. Similarly, the new test demonstrates a failure on a lowered atomicrmw which required inserting runtime address space checks. The invalid cases are dynamically dead, we should not error, and the AtomicExpand pass shouldn't have to consider the details of the incoming pointer to produce valid IR. This should go to the release branch. This fixes broken -O0 compiles with 64-bit atomics which would have started failing in 1d0370872f28ec9965448f33db1b105addaf64ae. --- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 7 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 7 +- llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 646 ++++++++++++++++++ .../CodeGen/AMDGPU/invalid-addrspacecast.ll | 44 +- 4 files changed, 687 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 908d323c7fec9..649deee346e90 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2426,11 +2426,8 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( return true; } - DiagnosticInfoUnsupported InvalidAddrSpaceCast( - MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); - - LLVMContext &Ctx = MF.getFunction().getContext(); - Ctx.diagnose(InvalidAddrSpaceCast); + // Invalid casts are poison. + // TODO: Should return poison B.buildUndef(Dst); MI.eraseFromParent(); return true; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 62ee196cf8e17..e09b310d107ac 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7341,11 +7341,8 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, // global <-> flat are no-ops and never emitted. - const MachineFunction &MF = DAG.getMachineFunction(); - DiagnosticInfoUnsupported InvalidAddrSpaceCast( - MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc()); - DAG.getContext()->diagnose(InvalidAddrSpaceCast); - + // Invalid casts are poison. 
+ // TODO: Should return poison return DAG.getUNDEF(Op->getValueType(0)); } diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index 5f56568ef88e4..afcd9b5fcdc7e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -444,6 +444,652 @@ define float @no_unsafe(ptr %addr, float %val) { ret float %res } +@global = hidden addrspace(1) global i64 0, align 8 + +; Make sure there is no error on an invalid addrspacecast without optimizations +define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { +; GFX908-LABEL: optnone_atomicrmw_add_i64_expand: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: s_mov_b32 s6, 32 +; GFX908-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GFX908-NEXT: s_getpc_b64 s[6:7] +; GFX908-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 +; GFX908-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 +; GFX908-NEXT: s_cmp_eq_u32 s7, s4 +; GFX908-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX908-NEXT: s_mov_b64 s[4:5], -1 +; GFX908-NEXT: s_mov_b32 s6, 1 +; GFX908-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6 +; GFX908-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX908-NEXT: s_cbranch_vccnz .LBB4_3 +; GFX908-NEXT: .LBB4_1: ; %Flow +; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX908-NEXT: s_mov_b32 s4, 1 +; GFX908-NEXT: v_cmp_ne_u32_e64 s[4:5], v2, s4 +; GFX908-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX908-NEXT: s_cbranch_vccnz .LBB4_4 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.private +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_co_u32_e64 v0, s[4:5], v3, v0 +; GFX908-NEXT: v_addc_co_u32_e64 v1, s[4:5], v4, v1, s[4:5] +; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen +; GFX908-NEXT: s_branch .LBB4_4 +; GFX908-NEXT: .LBB4_3: ; %atomicrmw.global +; GFX908-NEXT: s_getpc_b64 s[4:5] +; GFX908-NEXT: s_add_u32 s4, s4, global@rel32@lo+4 +; GFX908-NEXT: s_addc_u32 s5, s5, global@rel32@hi+12 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: v_mov_b32_e32 v3, s5 +; GFX908-NEXT: flat_atomic_add_x2 v[3:4], v[2:3], v[0:1] glc +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_branch .LBB4_1 +; GFX908-NEXT: .LBB4_4: ; %atomicrmw.phi +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.end +; GFX908-NEXT: s_mov_b32 s4, 32 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_lshrrev_b64 v[1:2], s4, v[3:4] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: optnone_atomicrmw_add_i64_expand: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: s_mov_b32 s6, 32 +; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GFX90A-NEXT: s_getpc_b64 s[6:7] +; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 +; GFX90A-NEXT: s_cmp_eq_u32 s7, s4 +; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX90A-NEXT: s_mov_b64 s[4:5], -1 +; GFX90A-NEXT: s_mov_b32 s6, 1 +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6 +; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: 
s_cbranch_vccnz .LBB4_3 +; GFX90A-NEXT: .LBB4_1: ; %Flow +; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX90A-NEXT: s_mov_b32 s4, 1 +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[4:5], v4, s4 +; GFX90A-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_vccnz .LBB4_4 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.private +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[4:5], v2, v0 +; GFX90A-NEXT: v_addc_co_u32_e64 v1, s[4:5], v3, v1, s[4:5] +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_branch .LBB4_4 +; GFX90A-NEXT: .LBB4_3: ; %atomicrmw.global +; GFX90A-NEXT: s_getpc_b64 s[4:5] +; GFX90A-NEXT: s_add_u32 s4, s4, global@rel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s5, s5, global@rel32@hi+12 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_branch .LBB4_1 +; GFX90A-NEXT: .LBB4_4: ; %atomicrmw.phi +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.end +; GFX90A-NEXT: s_mov_b32 s4, 32 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_lshrrev_b64 v[4:5], s4, v[2:3] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: optnone_atomicrmw_add_i64_expand: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: s_mov_b32 s2, 32 +; GFX942-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX942-NEXT: s_getpc_b64 s[2:3] +; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 +; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 +; GFX942-NEXT: s_cmp_eq_u32 s3, s0 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], -1 +; GFX942-NEXT: s_mov_b32 s2, 1 +; GFX942-NEXT: v_cmp_ne_u32_e64 s[2:3], v2, s2 +; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_cbranch_vccnz .LBB4_3 +; GFX942-NEXT: .LBB4_1: ; %Flow +; GFX942-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX942-NEXT: s_mov_b32 s0, 1 +; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], v4, s0 +; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX942-NEXT: s_cbranch_vccnz .LBB4_4 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.private +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] +; GFX942-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX942-NEXT: s_branch .LBB4_4 +; GFX942-NEXT: .LBB4_3: ; %atomicrmw.global +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, global@rel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, global@rel32@hi+12 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_branch .LBB4_1 +; GFX942-NEXT: .LBB4_4: ; %atomicrmw.phi +; GFX942-NEXT: ; %bb.5: ; %atomicrmw.end +; GFX942-NEXT: s_mov_b32 s0, 32 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b64 v[4:5], s0, v[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-LABEL: 
optnone_atomicrmw_add_i64_expand: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1100-NEXT: s_mov_b32 s2, 32 +; GFX1100-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX1100-NEXT: s_getpc_b64 s[2:3] +; GFX1100-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 +; GFX1100-NEXT: s_cmp_eq_u32 s3, s0 +; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 +; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1100-NEXT: s_mov_b32 s0, -1 +; GFX1100-NEXT: s_mov_b32 s1, 1 +; GFX1100-NEXT: v_cmp_ne_u32_e64 s1, v2, s1 +; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1 +; GFX1100-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1100-NEXT: s_cbranch_vccnz .LBB4_3 +; GFX1100-NEXT: .LBB4_1: ; %Flow +; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1100-NEXT: s_mov_b32 s0, 1 +; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, v2, s0 +; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1100-NEXT: s_cbranch_vccnz .LBB4_4 +; GFX1100-NEXT: ; %bb.2: ; %atomicrmw.private +; GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-NEXT: scratch_load_b64 v[3:4], off, s0 +; GFX1100-NEXT: s_waitcnt vmcnt(0) +; GFX1100-NEXT: v_add_co_u32 v0, s0, v3, v0 +; GFX1100-NEXT: v_add_co_ci_u32_e64 v1, s0, v4, v1, s0 +; GFX1100-NEXT: scratch_store_b64 off, v[0:1], s0 +; GFX1100-NEXT: s_branch .LBB4_4 +; GFX1100-NEXT: .LBB4_3: ; %atomicrmw.global +; GFX1100-NEXT: s_getpc_b64 s[0:1] +; GFX1100-NEXT: s_add_u32 s0, s0, global@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s1, s1, global@rel32@hi+12 +; GFX1100-NEXT: v_mov_b32_e32 v3, s1 +; GFX1100-NEXT: v_mov_b32_e32 v2, s0 +; GFX1100-NEXT: flat_atomic_add_u64 v[3:4], v[2:3], v[0:1] glc +; GFX1100-NEXT: s_mov_b32 s0, 0 +; GFX1100-NEXT: s_branch .LBB4_1 +; GFX1100-NEXT: .LBB4_4: ; %atomicrmw.phi +; GFX1100-NEXT: ; %bb.5: ; %atomicrmw.end +; GFX1100-NEXT: s_mov_b32 s0, 32 +; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_lshrrev_b64 v[1:2], s0, v[3:4] +; GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-LABEL: optnone_atomicrmw_add_i64_expand: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: s_wait_expcnt 0x0 +; GFX1200-NEXT: s_wait_samplecnt 0x0 +; GFX1200-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1200-NEXT: s_mov_b32 s2, 32 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX1200-NEXT: s_getpc_b64 s[2:3] +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_sext_i32_i16 s3, s3 +; GFX1200-NEXT: s_add_co_u32 s2, s2, global@rel32@lo+12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_ci_u32 s3, s3, global@rel32@hi+24 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cmp_eq_u32 s3, s0 +; GFX1200-NEXT: s_cselect_b32 s0, -1, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1200-NEXT: s_mov_b32 s0, -1 +; GFX1200-NEXT: s_mov_b32 s1, 1 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cmp_ne_u32_e64 s1, v2, s1 +; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s1 +; GFX1200-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cbranch_vccnz .LBB4_3 +; GFX1200-NEXT: .LBB4_1: ; %Flow +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1200-NEXT: s_mov_b32 s0, 1 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cmp_ne_u32_e64 s0, v2, s0 +; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1200-NEXT: 
s_wait_alu 0xfffe +; GFX1200-NEXT: s_cbranch_vccnz .LBB4_4 +; GFX1200-NEXT: ; %bb.2: ; %atomicrmw.private +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: scratch_load_b64 v[3:4], off, s0 +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_add_co_u32 v0, s0, v3, v0 +; GFX1200-NEXT: s_wait_alu 0xf1ff +; GFX1200-NEXT: v_add_co_ci_u32_e64 v1, s0, v4, v1, s0 +; GFX1200-NEXT: scratch_store_b64 off, v[0:1], s0 +; GFX1200-NEXT: s_branch .LBB4_4 +; GFX1200-NEXT: .LBB4_3: ; %atomicrmw.global +; GFX1200-NEXT: s_getpc_b64 s[0:1] +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_sext_i32_i16 s1, s1 +; GFX1200-NEXT: s_add_co_u32 s0, s0, global@rel32@lo+12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_ci_u32 s1, s1, global@rel32@hi+24 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_mov_b32_e32 v3, s1 +; GFX1200-NEXT: v_mov_b32_e32 v2, s0 +; GFX1200-NEXT: flat_atomic_add_u64 v[3:4], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1200-NEXT: s_mov_b32 s0, 0 +; GFX1200-NEXT: s_branch .LBB4_1 +; GFX1200-NEXT: .LBB4_4: ; %atomicrmw.phi +; GFX1200-NEXT: ; %bb.5: ; %atomicrmw.end +; GFX1200-NEXT: s_mov_b32 s0, 32 +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: s_wait_alu 0xf1fe +; GFX1200-NEXT: v_lshrrev_b64 v[1:2], s0, v[3:4] +; GFX1200-NEXT: v_mov_b32_e32 v0, v3 +; GFX1200-NEXT: s_setpc_b64 s[30:31] + %rmw = atomicrmw add ptr addrspacecast (ptr addrspace(1) @global to ptr), i64 %val syncscope("agent") monotonic, align 8 + ret i64 %rmw +} + +; Make sure there is no error on an invalid addrspacecast without optimizations +define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { +; GFX908-LABEL: optnone_atomicrmw_fadd_f64_expand: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: s_mov_b32 s6, 32 +; GFX908-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GFX908-NEXT: s_getpc_b64 s[6:7] +; GFX908-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 +; GFX908-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 +; GFX908-NEXT: s_cmp_eq_u32 s7, s4 +; GFX908-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX908-NEXT: s_mov_b64 s[4:5], -1 +; GFX908-NEXT: s_mov_b32 s6, 1 +; GFX908-NEXT: v_readfirstlane_b32 s7, v2 +; GFX908-NEXT: s_cmp_lg_u32 s7, s6 +; GFX908-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX908-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX908-NEXT: s_cbranch_vccnz .LBB5_2 +; GFX908-NEXT: s_branch .LBB5_3 +; GFX908-NEXT: .LBB5_1: ; %atomicrmw.private +; GFX908-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] +; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen +; GFX908-NEXT: s_branch .LBB5_6 +; GFX908-NEXT: .LBB5_2: ; %atomicrmw.global +; GFX908-NEXT: s_getpc_b64 s[4:5] +; GFX908-NEXT: s_add_u32 s4, s4, global@rel32@lo+4 +; GFX908-NEXT: s_addc_u32 s5, s5, global@rel32@hi+12 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: v_mov_b32_e32 v3, s5 +; GFX908-NEXT: flat_load_dwordx2 v[3:4], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_branch .LBB5_4 +; GFX908-NEXT: .LBB5_3: ; %Flow +; GFX908-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX908-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX908-NEXT: s_branch .LBB5_6 +; GFX908-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_add_f64 v[3:4], v[5:6], v[0:1] +; GFX908-NEXT: s_getpc_b64 s[6:7] +; GFX908-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 +; GFX908-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 +; GFX908-NEXT: v_mov_b32_e32 v8, s7 +; GFX908-NEXT: v_mov_b32_e32 v7, s6 +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_cmp_eq_u64_e64 s[6:7], v[3:4], v[5:6] +; GFX908-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB5_4 +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.end1 +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_branch .LBB5_3 +; GFX908-NEXT: .LBB5_6: ; %atomicrmw.phi +; GFX908-NEXT: ; %bb.7: ; %atomicrmw.end +; GFX908-NEXT: s_mov_b32 s4, 32 +; GFX908-NEXT: v_lshrrev_b64 v[1:2], s4, v[3:4] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: optnone_atomicrmw_fadd_f64_expand: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: s_mov_b32 s6, 32 +; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GFX90A-NEXT: s_getpc_b64 s[6:7] +; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 +; GFX90A-NEXT: s_cmp_eq_u32 s7, s4 +; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX90A-NEXT: s_mov_b64 s[4:5], -1 +; GFX90A-NEXT: s_mov_b32 s6, 1 +; GFX90A-NEXT: v_readfirstlane_b32 s7, v2 +; GFX90A-NEXT: s_cmp_lg_u32 s7, s6 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_cbranch_vccnz .LBB5_2 +; GFX90A-NEXT: s_branch .LBB5_3 +; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.private +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_branch .LBB5_6 +; GFX90A-NEXT: .LBB5_2: ; %atomicrmw.global +; GFX90A-NEXT: s_getpc_b64 s[4:5] +; GFX90A-NEXT: s_add_u32 s4, s4, global@rel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s5, s5, global@rel32@hi+12 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_branch .LBB5_4 +; GFX90A-NEXT: .LBB5_3: ; %Flow +; GFX90A-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX90A-NEXT: s_branch .LBB5_6 +; GFX90A-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX90A-NEXT: s_getpc_b64 s[6:7] +; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[6:7], v[2:3], v[4:5] +; GFX90A-NEXT: s_or_b64 s[4:5], 
s[6:7], s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB5_4 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.end1 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_branch .LBB5_3 +; GFX90A-NEXT: .LBB5_6: ; %atomicrmw.phi +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.end +; GFX90A-NEXT: s_mov_b32 s4, 32 +; GFX90A-NEXT: v_lshrrev_b64 v[4:5], s4, v[2:3] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: optnone_atomicrmw_fadd_f64_expand: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: s_mov_b32 s2, 32 +; GFX942-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX942-NEXT: s_getpc_b64 s[2:3] +; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 +; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 +; GFX942-NEXT: s_cmp_eq_u32 s3, s0 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], -1 +; GFX942-NEXT: s_mov_b32 s2, 1 +; GFX942-NEXT: v_readfirstlane_b32 s3, v2 +; GFX942-NEXT: s_cmp_lg_u32 s3, s2 +; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_cbranch_vccnz .LBB5_2 +; GFX942-NEXT: s_branch .LBB5_3 +; GFX942-NEXT: .LBB5_1: ; %atomicrmw.private +; GFX942-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] +; GFX942-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX942-NEXT: s_branch .LBB5_6 +; GFX942-NEXT: .LBB5_2: ; %atomicrmw.global +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, global@rel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, global@rel32@hi+12 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_branch .LBB5_4 +; GFX942-NEXT: .LBB5_3: ; %Flow +; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX942-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX942-NEXT: s_branch .LBB5_6 +; GFX942-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX942-NEXT: s_getpc_b64 s[2:3] +; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 +; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 +; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[2:3] +; GFX942-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_cmp_eq_u64_e64 s[2:3], v[2:3], v[4:5] +; GFX942-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB5_4 +; GFX942-NEXT: ; %bb.5: ; %atomicrmw.end1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_branch .LBB5_3 +; GFX942-NEXT: .LBB5_6: ; %atomicrmw.phi +; GFX942-NEXT: ; %bb.7: ; %atomicrmw.end +; GFX942-NEXT: s_mov_b32 s0, 32 +; GFX942-NEXT: v_lshrrev_b64 v[4:5], s0, v[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-LABEL: optnone_atomicrmw_fadd_f64_expand: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX1100-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1100-NEXT: s_mov_b32 s2, 32 +; GFX1100-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX1100-NEXT: s_getpc_b64 s[2:3] +; GFX1100-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 +; GFX1100-NEXT: s_cmp_eq_u32 s3, s0 +; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 +; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1100-NEXT: s_mov_b32 s0, -1 +; GFX1100-NEXT: s_mov_b32 s1, 1 +; GFX1100-NEXT: v_cmp_ne_u32_e64 s1, v2, s1 +; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1 +; GFX1100-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1100-NEXT: s_cbranch_vccnz .LBB5_2 +; GFX1100-NEXT: s_branch .LBB5_3 +; GFX1100-NEXT: .LBB5_1: ; %atomicrmw.private +; GFX1100-NEXT: scratch_load_b64 v[3:4], off, s0 +; GFX1100-NEXT: s_waitcnt vmcnt(0) +; GFX1100-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] +; GFX1100-NEXT: scratch_store_b64 off, v[0:1], s0 +; GFX1100-NEXT: s_branch .LBB5_6 +; GFX1100-NEXT: .LBB5_2: ; %atomicrmw.global +; GFX1100-NEXT: s_getpc_b64 s[0:1] +; GFX1100-NEXT: s_add_u32 s0, s0, global@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s1, s1, global@rel32@hi+12 +; GFX1100-NEXT: v_mov_b32_e32 v3, s1 +; GFX1100-NEXT: v_mov_b32_e32 v2, s0 +; GFX1100-NEXT: flat_load_b64 v[3:4], v[2:3] +; GFX1100-NEXT: s_mov_b32 s0, 0 +; GFX1100-NEXT: s_branch .LBB5_4 +; GFX1100-NEXT: .LBB5_3: ; %Flow +; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1100-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX1100-NEXT: s_branch .LBB5_6 +; GFX1100-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_mov_b32_e32 v6, v4 +; GFX1100-NEXT: v_mov_b32_e32 v5, v3 +; GFX1100-NEXT: v_add_f64 v[3:4], v[5:6], v[0:1] +; GFX1100-NEXT: s_getpc_b64 s[2:3] +; GFX1100-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 +; GFX1100-NEXT: v_mov_b32_e32 v8, s3 +; GFX1100-NEXT: v_mov_b32_e32 v7, s2 +; GFX1100-NEXT: flat_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6] glc +; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_cmp_eq_u64_e64 s1, v[3:4], v[5:6] +; GFX1100-NEXT: s_or_b32 s0, s1, s0 +; GFX1100-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1100-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1100-NEXT: ; %bb.5: ; %atomicrmw.end1 +; GFX1100-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1100-NEXT: s_mov_b32 s0, 0 +; GFX1100-NEXT: s_branch .LBB5_3 +; GFX1100-NEXT: .LBB5_6: ; %atomicrmw.phi +; GFX1100-NEXT: ; %bb.7: ; %atomicrmw.end +; GFX1100-NEXT: s_mov_b32 s0, 32 +; GFX1100-NEXT: v_lshrrev_b64 v[1:2], s0, v[3:4] +; GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-LABEL: optnone_atomicrmw_fadd_f64_expand: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: s_wait_expcnt 0x0 +; GFX1200-NEXT: s_wait_samplecnt 0x0 +; GFX1200-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1200-NEXT: s_mov_b32 s2, 32 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX1200-NEXT: s_getpc_b64 s[2:3] +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_sext_i32_i16 s3, s3 +; GFX1200-NEXT: s_add_co_u32 s2, s2, global@rel32@lo+12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_ci_u32 s3, s3, global@rel32@hi+24 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cmp_eq_u32 s3, s0 +; GFX1200-NEXT: s_cselect_b32 s0, -1, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: 
v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1200-NEXT: s_mov_b32 s0, -1 +; GFX1200-NEXT: s_mov_b32 s1, 1 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cmp_ne_u32_e64 s1, v2, s1 +; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s1 +; GFX1200-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cbranch_vccnz .LBB5_2 +; GFX1200-NEXT: s_branch .LBB5_3 +; GFX1200-NEXT: .LBB5_1: ; %atomicrmw.private +; GFX1200-NEXT: scratch_load_b64 v[3:4], off, s0 +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_add_f64_e64 v[0:1], v[3:4], v[0:1] +; GFX1200-NEXT: scratch_store_b64 off, v[0:1], s0 +; GFX1200-NEXT: s_branch .LBB5_6 +; GFX1200-NEXT: .LBB5_2: ; %atomicrmw.global +; GFX1200-NEXT: s_getpc_b64 s[0:1] +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_sext_i32_i16 s1, s1 +; GFX1200-NEXT: s_add_co_u32 s0, s0, global@rel32@lo+12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_ci_u32 s1, s1, global@rel32@hi+24 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_mov_b32_e32 v3, s1 +; GFX1200-NEXT: v_mov_b32_e32 v2, s0 +; GFX1200-NEXT: flat_load_b64 v[3:4], v[2:3] +; GFX1200-NEXT: s_mov_b32 s0, 0 +; GFX1200-NEXT: s_branch .LBB5_4 +; GFX1200-NEXT: .LBB5_3: ; %Flow +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX1200-NEXT: s_branch .LBB5_6 +; GFX1200-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: v_mov_b32_e32 v6, v4 +; GFX1200-NEXT: v_mov_b32_e32 v5, v3 +; GFX1200-NEXT: v_add_f64_e64 v[3:4], v[5:6], v[0:1] +; GFX1200-NEXT: s_getpc_b64 s[2:3] +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_sext_i32_i16 s3, s3 +; GFX1200-NEXT: s_add_co_u32 s2, s2, global@rel32@lo+12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_ci_u32 s3, s3, global@rel32@hi+24 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_mov_b32_e32 v8, s3 +; GFX1200-NEXT: v_mov_b32_e32 v7, s2 +; GFX1200-NEXT: flat_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: v_cmp_eq_u64_e64 s1, v[3:4], v[5:6] +; GFX1200-NEXT: s_or_b32 s0, s1, s0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1200-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1200-NEXT: ; %bb.5: ; %atomicrmw.end1 +; GFX1200-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1200-NEXT: s_mov_b32 s0, 0 +; GFX1200-NEXT: s_branch .LBB5_3 +; GFX1200-NEXT: .LBB5_6: ; %atomicrmw.phi +; GFX1200-NEXT: ; %bb.7: ; %atomicrmw.end +; GFX1200-NEXT: s_mov_b32 s0, 32 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_lshrrev_b64 v[1:2], s0, v[3:4] +; GFX1200-NEXT: v_mov_b32_e32 v0, v3 +; GFX1200-NEXT: s_setpc_b64 s[30:31] + %rmw = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @global to ptr), double %val monotonic, align 8 + ret double %rmw +} + attributes #0 = { nounwind } +attributes #1 = { noinline nounwind optnone } !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll index e1ba6489a5317..f0609f62a9024 100644 --- a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll @@ -1,36 +1,66 @@ -; RUN: not llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s -; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire 
-mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsad < %s | FileCheck %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s + +; Check illegal casts are codegened as poison, and not an error. -; ERROR: error: :0:0: in function use_group_to_global_addrspacecast void (ptr addrspace(3)): invalid addrspacecast define amdgpu_kernel void @use_group_to_global_addrspacecast(ptr addrspace(3) %ptr) { +; CHECK-LABEL: use_group_to_global_addrspacecast: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: flat_store_dword v[0:1], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_endpgm %stof = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(1) store volatile i32 0, ptr addrspace(1) %stof ret void } -; ERROR: error: :0:0: in function use_local_to_constant32bit_addrspacecast void (ptr addrspace(3)): invalid addrspacecast define amdgpu_kernel void @use_local_to_constant32bit_addrspacecast(ptr addrspace(3) %ptr) { +; CHECK-LABEL: use_local_to_constant32bit_addrspacecast: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_load_dword s0, s[0:1], 0x0 +; CHECK-NEXT: s_endpgm %stof = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(6) %load = load volatile i32, ptr addrspace(6) %stof ret void } -; ERROR: error: :0:0: in function use_constant32bit_to_local_addrspacecast void (ptr addrspace(6)): invalid addrspacecast define amdgpu_kernel void @use_constant32bit_to_local_addrspacecast(ptr addrspace(6) %ptr) { +; CHECK-LABEL: use_constant32bit_to_local_addrspacecast: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: ds_read_b32 v0, v0 +; CHECK-NEXT: s_endpgm %cast = addrspacecast ptr addrspace(6) %ptr to ptr addrspace(3) %load = load volatile i32, ptr addrspace(3) %cast ret void } -; ERROR: error: :0:0: in function use_local_to_42_addrspacecast void (ptr addrspace(3)): invalid addrspacecast define amdgpu_kernel void @use_local_to_42_addrspacecast(ptr addrspace(3) %ptr) { +; SDAG-LABEL: use_local_to_42_addrspacecast: +; SDAG: ; %bb.0: +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: use_local_to_42_addrspacecast: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_endpgm %cast = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(42) store volatile ptr addrspace(42) %cast, ptr addrspace(1) null ret void } -; ERROR: error: :0:0: in function use_42_to_local_addrspacecast void (ptr addrspace(42)): invalid addrspacecast define amdgpu_kernel void @use_42_to_local_addrspacecast(ptr addrspace(42) %ptr) { +; CHECK-LABEL: use_42_to_local_addrspacecast: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: ds_read_b32 v0, v0 +; CHECK-NEXT: s_endpgm %cast = addrspacecast ptr addrspace(42) %ptr to ptr addrspace(3) %load = load volatile i32, ptr addrspace(3) %cast ret void From 4e41e9ac4c6fd1cb7f3aa3a42725727aff5aadd7 Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Mon, 17 Feb 2025 22:11:49 +0800 Subject: [PATCH 021/127] [mlir] Update docs for Greedy Pattern Rewrite Driver(NFC) (#126701) The `applyOpPatternsAndFold` is deprecated, use `applyOpPatternsGreedily` instead. 
--- mlir/docs/PatternRewriter.md | 2 +- mlir/lib/Reducer/ReductionTreePass.cpp | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/mlir/docs/PatternRewriter.md b/mlir/docs/PatternRewriter.md index d15e7e5a80678..9df4647299010 100644 --- a/mlir/docs/PatternRewriter.md +++ b/mlir/docs/PatternRewriter.md @@ -361,7 +361,7 @@ This driver comes in two fashions: * `applyPatternsGreedily` ("region-based driver") applies patterns to all ops in a given region or a given container op (but not the container op itself). I.e., the worklist is initialized with all containing ops. -* `applyOpPatternsAndFold` ("op-based driver") applies patterns to the +* `applyOpPatternsGreedily` ("op-based driver") applies patterns to the provided list of operations. I.e., the worklist is initialized with the specified list of ops. diff --git a/mlir/lib/Reducer/ReductionTreePass.cpp b/mlir/lib/Reducer/ReductionTreePass.cpp index 2d2744bfc2732..ef32adbab5577 100644 --- a/mlir/lib/Reducer/ReductionTreePass.cpp +++ b/mlir/lib/Reducer/ReductionTreePass.cpp @@ -56,13 +56,14 @@ static void applyPatterns(Region &region, opsInRange.push_back(&op.value()); } - // `applyOpPatternsAndFold` may erase the ops so we can't do the pattern - // matching in above iteration. Besides, erase op not-in-range may end up in - // invalid module, so `applyOpPatternsAndFold` should come before that - // transform. + // `applyOpPatternsGreedily` with folding may erase the ops so we can't do the + // pattern matching in above iteration. Besides, erase op not-in-range may end + // up in invalid module, so `applyOpPatternsGreedily` with folding should come + // before that transform. for (Operation *op : opsInRange) { - // `applyOpPatternsAndFold` returns whether the op is convered. Omit it - // because we don't have expectation this reduction will be success or not. + // `applyOpPatternsGreedily` with folding returns whether the op is + // covered. Omit it because we don't have an expectation of whether this + // reduction will succeed or not. GreedyRewriteConfig config; config.strictMode = GreedyRewriteStrictness::ExistingOps; (void)applyOpPatternsGreedily(op, patterns, config); From 919e72f2513d57fc2105f6e3477c13eb1f0c6cba Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Mon, 17 Feb 2025 15:18:27 +0100 Subject: [PATCH 022/127] [flang][OpenMP] Support `bind` clause for `teams loop` (#127021) Extends generic `loop` directive support by supporting the `bind` clause. Since semantic checking does the heavy lifting of verifying the proper usage of the clause modifier, we can simply enable code-gen for `teams loop bind(...)` without the need to differentiate between the values the clause can accept.
- if (combinedInfo != GenericLoopCombinedInfo::Standalone && + // For `loop` and `teams loop` directives, `bind` is supported. + // Additionally, for `teams loop`, semantic checking verifies that the + // `bind` clause modifier is `teams`, so no need to check this here again. + if (combinedInfo == GenericLoopCombinedInfo::ParallelLoop && loopOp.getBindKind()) return todo("bind"); diff --git a/flang/test/Lower/OpenMP/generic-loop-rewriting.f90 b/flang/test/Lower/OpenMP/generic-loop-rewriting.f90 index fa26425356dd9..0699c36c69519 100644 --- a/flang/test/Lower/OpenMP/generic-loop-rewriting.f90 +++ b/flang/test/Lower/OpenMP/generic-loop-rewriting.f90 @@ -1,5 +1,12 @@ -!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s +!RUN: split-file %s %t +!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %t/no_bind_clause.f90 -o - \ +!RUN: | FileCheck %s + +!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %t/bind_clause_teams.f90 -o - \ +!RUN: | FileCheck %s + +!--- no_bind_clause.f90 subroutine target_teams_loop implicit none integer :: x, i @@ -10,6 +17,17 @@ subroutine target_teams_loop end do end subroutine target_teams_loop +!--- bind_clause_teams.f90 +subroutine target_teams_loop + implicit none + integer :: x, i + + !$omp target teams loop bind(teams) + do i = 0, 10 + x = x + i + end do +end subroutine target_teams_loop + !CHECK-LABEL: func.func @_QPtarget_teams_loop !CHECK: omp.target map_entries( !CHECK-SAME: %{{.*}} -> %[[I_ARG:[^[:space:]]+]], diff --git a/flang/test/Transforms/generic-loop-rewriting-todo.mlir b/flang/test/Transforms/generic-loop-rewriting-todo.mlir index cbde981c4c49d..25baffe34e394 100644 --- a/flang/test/Transforms/generic-loop-rewriting-todo.mlir +++ b/flang/test/Transforms/generic-loop-rewriting-todo.mlir @@ -16,22 +16,6 @@ func.func @_QPparallel_loop() { return } -func.func @_QPloop_bind() { - omp.teams { - %c0 = arith.constant 0 : i32 - %c10 = arith.constant 10 : i32 - %c1 = arith.constant 1 : i32 - // expected-error@below {{not yet implemented: Unhandled clause bind in omp.loop operation}} - omp.loop bind(thread) { - omp.loop_nest (%arg3) : i32 = (%c0) to (%c10) inclusive step (%c1) { - omp.yield - } - } - omp.terminator - } - return -} - omp.declare_reduction @add_reduction_i32 : i32 init { ^bb0(%arg0: i32): %c0_i32 = arith.constant 0 : i32 From 9d487050a144b895950a6fd48b993513a714e69d Mon Sep 17 00:00:00 2001 From: flovent <144676429+flovent@users.noreply.github.com> Date: Mon, 17 Feb 2025 22:35:40 +0800 Subject: [PATCH 023/127] [clang][analyzer] Teach the BlockInCriticalSectionChecker about O_NONBLOCK streams (#127049) This PR closes #124474. When `read` or `recv` is called on a non-blocking file descriptor or an invalid file descriptor (`-1`), the call will not block inside a critical section. This commit checks for non-blocking file descriptors assigned by the `open` function with the `O_NONBLOCK` flag.
--------- Co-authored-by: Balazs Benics --- .../BlockInCriticalSectionChecker.cpp | 76 +++++++++++++++++++ .../system-header-simulator-cxx-std-locks.h | 13 ++++ clang/test/Analysis/issue-124474.cpp | 37 +++++++++ 3 files changed, 126 insertions(+) create mode 100644 clang/test/Analysis/Inputs/system-header-simulator-cxx-std-locks.h create mode 100644 clang/test/Analysis/issue-124474.cpp diff --git a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp index 7460781799d08..bf35bee70870b 100644 --- a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp @@ -145,6 +145,57 @@ using MutexDescriptor = std::variant; +class SuppressNonBlockingStreams : public BugReporterVisitor { +private: + const CallDescription OpenFunction{CDM::CLibrary, {"open"}, 2}; + SymbolRef StreamSym; + const int NonBlockMacroVal; + bool Satisfied = false; + +public: + SuppressNonBlockingStreams(SymbolRef StreamSym, int NonBlockMacroVal) + : StreamSym(StreamSym), NonBlockMacroVal(NonBlockMacroVal) {} + + static void *getTag() { + static bool Tag; + return &Tag; + } + + void Profile(llvm::FoldingSetNodeID &ID) const override { + ID.AddPointer(getTag()); + } + + PathDiagnosticPieceRef VisitNode(const ExplodedNode *N, + BugReporterContext &BRC, + PathSensitiveBugReport &BR) override { + if (Satisfied) + return nullptr; + + std::optional Point = N->getLocationAs(); + if (!Point) + return nullptr; + + const auto *CE = Point->getStmtAs(); + if (!CE || !OpenFunction.matchesAsWritten(*CE)) + return nullptr; + + if (N->getSVal(CE).getAsSymbol() != StreamSym) + return nullptr; + + Satisfied = true; + + // Check if open's second argument contains O_NONBLOCK + const llvm::APSInt *FlagVal = N->getSVal(CE->getArg(1)).getAsInteger(); + if (!FlagVal) + return nullptr; + + if ((*FlagVal & NonBlockMacroVal) != 0) + BR.markInvalid(getTag(), nullptr); + + return nullptr; + } +}; + class BlockInCriticalSectionChecker : public Checker { private: const std::array MutexDescriptors{ @@ -182,6 +233,9 @@ class BlockInCriticalSectionChecker : public Checker { const BugType BlockInCritSectionBugType{ this, "Call to blocking function in critical section", "Blocking Error"}; + using O_NONBLOCKValueTy = std::optional; + mutable std::optional O_NONBLOCKValue; + void reportBlockInCritSection(const CallEvent &call, CheckerContext &C) const; [[nodiscard]] const NoteTag *createCritSectionNote(CritSectionMarker M, @@ -337,6 +391,28 @@ void BlockInCriticalSectionChecker::reportBlockInCritSection( << "' inside of critical section"; auto R = std::make_unique(BlockInCritSectionBugType, os.str(), ErrNode); + // for 'read' and 'recv' call, check whether it's file descriptor(first + // argument) is + // created by 'open' API with O_NONBLOCK flag or is equal to -1, they will + // not cause block in these situations, don't report + StringRef FuncName = Call.getCalleeIdentifier()->getName(); + if (FuncName == "read" || FuncName == "recv") { + SVal SV = Call.getArgSVal(0); + SValBuilder &SVB = C.getSValBuilder(); + ProgramStateRef state = C.getState(); + ConditionTruthVal CTV = + state->areEqual(SV, SVB.makeIntVal(-1, C.getASTContext().IntTy)); + if (CTV.isConstrainedTrue()) + return; + + if (SymbolRef SR = SV.getAsSymbol()) { + if (!O_NONBLOCKValue) + O_NONBLOCKValue = tryExpandAsInteger( + "O_NONBLOCK", C.getBugReporter().getPreprocessor()); + if (*O_NONBLOCKValue) + R->addVisitor(SR, 
**O_NONBLOCKValue); + } + } R->addRange(Call.getSourceRange()); R->markInteresting(Call.getReturnValue()); C.emitReport(std::move(R)); diff --git a/clang/test/Analysis/Inputs/system-header-simulator-cxx-std-locks.h b/clang/test/Analysis/Inputs/system-header-simulator-cxx-std-locks.h new file mode 100644 index 0000000000000..054dd5405e1be --- /dev/null +++ b/clang/test/Analysis/Inputs/system-header-simulator-cxx-std-locks.h @@ -0,0 +1,13 @@ +#pragma clang system_header + +namespace std { +struct mutex { + void lock(); + void unlock(); +}; + +template struct lock_guard { + lock_guard(std::mutex &); + ~lock_guard(); +}; +} // namespace std diff --git a/clang/test/Analysis/issue-124474.cpp b/clang/test/Analysis/issue-124474.cpp new file mode 100644 index 0000000000000..ae30c4db552c1 --- /dev/null +++ b/clang/test/Analysis/issue-124474.cpp @@ -0,0 +1,37 @@ +// RUN: %clang_analyze_cc1 \ +// RUN: -analyzer-checker=core,unix.BlockInCriticalSection \ +// RUN: -analyzer-output text -verify %s + +// expected-no-diagnostics + +#include "Inputs/system-header-simulator-cxx-std-locks.h" + +std::mutex mtx; +using ssize_t = long long; +using size_t = unsigned long long; +int open(const char *__file, int __oflag, ...); +ssize_t read(int fd, void *buf, size_t count); +void close(int fd); +#define O_RDONLY 00 +#define O_NONBLOCK 04000 + +void foo() { + std::lock_guard lock(mtx); + + const char *filename = "example.txt"; + int fd = open(filename, O_RDONLY | O_NONBLOCK); + + char buffer[200] = {}; + read(fd, buffer, 199); // no-warning: fd is a non-block file descriptor or equals to -1 + close(fd); +} + +void foo1(int fd) { + std::lock_guard lock(mtx); + + const char *filename = "example.txt"; + char buffer[200] = {}; + if (fd == -1) + read(fd, buffer, 199); // no-warning: consider file descriptor is a symbol equals to -1 + close(fd); +} From ab2d330feab3e1d9927a3c0de1a9d6e9bda5abe9 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 17 Feb 2025 22:11:26 +0700 Subject: [PATCH 024/127] TableGen: Generate reverseComposeSubRegIndices (#127050) This is necessary to enable composing subregisters in peephole-opt. For now use a brute force table to find the return value. The worst case target is AMDGPU with a 399 x 399 entry table. --- .../include/llvm/CodeGen/TargetRegisterInfo.h | 21 +++++ .../Target/AMDGPU/AMDGPUUnitTests.cpp | 80 +++++++++++++++++++ llvm/utils/TableGen/RegisterInfoEmitter.cpp | 54 ++++++++++++- 3 files changed, 152 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 114149ff53d85..66fd3fb9b0526 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -732,6 +732,22 @@ class TargetRegisterInfo : public MCRegisterInfo { return composeSubRegIndicesImpl(a, b); } + /// Return a subregister index that will compose to give you the subregister + /// index. + /// + /// Finds a subregister index x such that composeSubRegIndices(a, x) == + /// b. Note that this relationship does not hold if + /// reverseComposeSubRegIndices returns the null subregister. + /// + /// The special null sub-register index composes as the identity. + unsigned reverseComposeSubRegIndices(unsigned a, unsigned b) const { + if (!a) + return b; + if (!b) + return a; + return reverseComposeSubRegIndicesImpl(a, b); + } + /// Transforms a LaneMask computed for one subregister to the lanemask that /// would have been computed when composing the subsubregisters with IdxA /// first. 
@sa composeSubRegIndices() @@ -774,6 +790,11 @@ class TargetRegisterInfo : public MCRegisterInfo { llvm_unreachable("Target has no sub-registers"); } + /// Overridden by TableGen in targets that have sub-registers. + virtual unsigned reverseComposeSubRegIndicesImpl(unsigned, unsigned) const { + llvm_unreachable("Target has no sub-registers"); + } + /// Overridden by TableGen in targets that have sub-registers. virtual LaneBitmask composeSubRegIndexLaneMaskImpl(unsigned, LaneBitmask) const { diff --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp index d0a3cfa84ee01..8fbd470815b79 100644 --- a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp +++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp @@ -164,3 +164,83 @@ TEST(AMDGPU, TestVGPRLimitsPerOccupancy) { testGPRLimits("VGPR", true, test); } + +static const char *printSubReg(const TargetRegisterInfo &TRI, unsigned SubReg) { + return SubReg ? TRI.getSubRegIndexName(SubReg) : ""; +} + +TEST(AMDGPU, TestReverseComposeSubRegIndices) { + auto TM = createAMDGPUTargetMachine("amdgcn-amd-", "gfx900", ""); + if (!TM) + return; + GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()), + std::string(TM->getTargetFeatureString()), *TM); + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + +#define EXPECT_SUBREG_EQ(A, B, Expect) \ + do { \ + unsigned Reversed = TRI->reverseComposeSubRegIndices(A, B); \ + EXPECT_EQ(Reversed, Expect) \ + << printSubReg(*TRI, A) << ", " << printSubReg(*TRI, B) << " => " \ + << printSubReg(*TRI, Reversed) << ", *" << printSubReg(*TRI, Expect); \ + } while (0); + + EXPECT_SUBREG_EQ(AMDGPU::NoSubRegister, AMDGPU::sub0, AMDGPU::sub0); + EXPECT_SUBREG_EQ(AMDGPU::sub0, AMDGPU::NoSubRegister, AMDGPU::sub0); + + EXPECT_SUBREG_EQ(AMDGPU::sub0, AMDGPU::sub0, AMDGPU::sub0); + + EXPECT_SUBREG_EQ(AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub1); + EXPECT_SUBREG_EQ(AMDGPU::sub1, AMDGPU::sub0, AMDGPU::NoSubRegister); + + EXPECT_SUBREG_EQ(AMDGPU::sub0_sub1, AMDGPU::sub0, AMDGPU::sub0); + EXPECT_SUBREG_EQ(AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1); + + EXPECT_SUBREG_EQ(AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub0_sub1, + AMDGPU::sub0_sub1); + EXPECT_SUBREG_EQ(AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2_sub3, + AMDGPU::sub0_sub1_sub2_sub3); + + EXPECT_SUBREG_EQ(AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub1_sub2, + AMDGPU::sub1_sub2); + EXPECT_SUBREG_EQ(AMDGPU::sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3, + AMDGPU::NoSubRegister); + + EXPECT_SUBREG_EQ(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0_sub1_sub2_sub3, + AMDGPU::NoSubRegister); + EXPECT_SUBREG_EQ(AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3, + AMDGPU::sub1_sub2_sub3); + + EXPECT_SUBREG_EQ(AMDGPU::sub0, AMDGPU::sub30, AMDGPU::NoSubRegister); + EXPECT_SUBREG_EQ(AMDGPU::sub30, AMDGPU::sub0, AMDGPU::NoSubRegister); + + EXPECT_SUBREG_EQ(AMDGPU::sub0, AMDGPU::sub31, AMDGPU::NoSubRegister); + EXPECT_SUBREG_EQ(AMDGPU::sub31, AMDGPU::sub0, AMDGPU::NoSubRegister); + + EXPECT_SUBREG_EQ(AMDGPU::sub0_sub1, AMDGPU::sub30, AMDGPU::NoSubRegister); + EXPECT_SUBREG_EQ(AMDGPU::sub30, AMDGPU::sub0_sub1, AMDGPU::NoSubRegister); + + EXPECT_SUBREG_EQ(AMDGPU::sub0_sub1, AMDGPU::sub30_sub31, + AMDGPU::NoSubRegister); + EXPECT_SUBREG_EQ(AMDGPU::sub30_sub31, AMDGPU::sub0_sub1, + AMDGPU::NoSubRegister); + + for (unsigned SubIdx0 = 1, LastSubReg = TRI->getNumSubRegIndices(); + SubIdx0 != LastSubReg; ++SubIdx0) { + for (unsigned SubIdx1 = 1; SubIdx1 != LastSubReg; ++SubIdx1) { + if (unsigned ForwardCompose = + TRI->composeSubRegIndices(SubIdx0, 
SubIdx1)) { + unsigned ReverseComposed = + TRI->reverseComposeSubRegIndices(SubIdx0, ForwardCompose); + EXPECT_EQ(ReverseComposed, SubIdx1); + } + + if (unsigned ReverseCompose = + TRI->reverseComposeSubRegIndices(SubIdx0, SubIdx1)) { + unsigned Recompose = TRI->composeSubRegIndices(SubIdx0, ReverseCompose); + EXPECT_EQ(Recompose, SubIdx1); + } + } + } +} diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index 2f9ec2e6e7a22..752ebdf01b948 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -680,8 +680,6 @@ static bool combine(const CodeGenSubRegIndex *Idx, void RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS, StringRef ClassName) { const auto &SubRegIndices = RegBank.getSubRegIndices(); - OS << "unsigned " << ClassName - << "::composeSubRegIndicesImpl(unsigned IdxA, unsigned IdxB) const {\n"; // Many sub-register indexes are composition-compatible, meaning that // @@ -713,7 +711,10 @@ void RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS, RowMap.push_back(Found); } - // Output the row map if there is multiple rows. + OS << "unsigned " << ClassName + << "::composeSubRegIndicesImpl(unsigned IdxA, unsigned IdxB) const {\n"; + + // Output the row map if there are multiple rows. if (Rows.size() > 1) { OS << " static const " << getMinimalTypeForRange(Rows.size(), 32) << " RowMap[" << SubRegIndicesSize << "] = {\n "; @@ -743,6 +744,51 @@ void RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS, else OS << " return Rows[0][IdxB];\n"; OS << "}\n\n"; + + // Generate the reverse case. + // + // FIXME: This is the brute force approach. Compress the table similar to the + // forward case. + OS << "unsigned " << ClassName + << "::reverseComposeSubRegIndicesImpl(unsigned IdxA, unsigned IdxB) const " + "{\n"; + OS << " static const " << getMinimalTypeForRange(SubRegIndicesSize + 1, 32) + << " Table[" << SubRegIndicesSize << "][" << SubRegIndicesSize + << "] = {\n"; + + // Find values where composeSubReg(A, X) == B; + for (const auto &IdxA : SubRegIndices) { + OS << " { "; + + SmallVectorImpl &Row = + Rows[RowMap[IdxA.EnumValue - 1]]; + for (const auto &IdxB : SubRegIndices) { + const CodeGenSubRegIndex *FoundReverse = nullptr; + + for (unsigned i = 0, e = SubRegIndicesSize; i != e; ++i) { + const CodeGenSubRegIndex *This = &SubRegIndices[i]; + const CodeGenSubRegIndex *Composed = Row[i]; + if (Composed == &IdxB) { + if (FoundReverse && FoundReverse != This) // Not unique + break; + FoundReverse = This; + } + } + + if (FoundReverse) { + OS << FoundReverse->getQualifiedName() << ", "; + } else { + OS << "0, "; + } + } + OS << "},\n"; + } + + OS << " };\n\n"; + OS << " --IdxA; assert(IdxA < " << SubRegIndicesSize << ");\n" + << " --IdxB; assert(IdxB < " << SubRegIndicesSize << ");\n"; + OS << " return Table[IdxA][IdxB];\n"; + OS << " }\n\n"; } void RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS, @@ -1113,6 +1159,8 @@ void RegisterInfoEmitter::runTargetHeader(raw_ostream &OS) { << " unsigned PC = 0, unsigned HwMode = 0);\n"; if (!RegBank.getSubRegIndices().empty()) { OS << " unsigned composeSubRegIndicesImpl" + << "(unsigned, unsigned) const override;\n" + << " unsigned reverseComposeSubRegIndicesImpl" << "(unsigned, unsigned) const override;\n" << " LaneBitmask composeSubRegIndexLaneMaskImpl" << "(unsigned, LaneBitmask) const override;\n" From fb29f19fdb0b2b3c8c87cc767482d941818e92a8 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 17 
Feb 2025 10:16:33 -0500 Subject: [PATCH 025/127] [libc++] Synchronize status pages with Github issues list --- libcxx/docs/Status/Cxx17Issues.csv | 4 ++-- libcxx/docs/Status/Cxx20Papers.csv | 8 ++++---- libcxx/docs/Status/Cxx23Papers.csv | 2 +- libcxx/docs/Status/Cxx2cPapers.csv | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/libcxx/docs/Status/Cxx17Issues.csv b/libcxx/docs/Status/Cxx17Issues.csv index e6a232980cf7c..477f3d363a4e2 100644 --- a/libcxx/docs/Status/Cxx17Issues.csv +++ b/libcxx/docs/Status/Cxx17Issues.csv @@ -158,14 +158,14 @@ "`LWG2683 `__","filesystem::copy() says ""no effects""","2016-06 (Oulu)","|Complete|","","" "`LWG2684 `__","priority_queue lacking comparator typedef","2016-06 (Oulu)","|Complete|","","" "`LWG2685 `__","shared_ptr deleters must not throw on move construction","2016-06 (Oulu)","|Complete|","","" -"`LWG2687 `__","{inclusive,exclusive}_scan misspecified","2016-06 (Oulu)","","","" +"`LWG2687 `__","LWG2687: {inclusive,exclusive}_scan misspecified","2016-06 (Oulu)","|Complete|","","" "`LWG2688 `__","clamp misses preconditions and has extraneous condition on result","2016-06 (Oulu)","|Complete|","","" "`LWG2689 `__","Parallel versions of std::copy and std::move shouldn't be in order","2016-06 (Oulu)","|Nothing To Do|","","" "`LWG2698 `__","Effect of assign() on iterators/pointers/references","2016-06 (Oulu)","|Complete|","","" "`LWG2704 `__","recursive_directory_iterator's members should require '``*this`` is dereferenceable'","2016-06 (Oulu)","|Complete|","","" "`LWG2706 `__","Error reporting for recursive_directory_iterator::pop() is under-specified","2016-06 (Oulu)","|Complete|","","" "`LWG2707 `__","path construction and assignment should have ""string_type&&"" overloads","2016-06 (Oulu)","|Complete|","","" -"`LWG2709 `__","offsetof is unnecessarily imprecise","2016-06 (Oulu)","","","" +"`LWG2709 `__","LWG2709: offsetof is unnecessarily imprecise","2016-06 (Oulu)","|Nothing To Do|","","" "`LWG2710 `__","""Effects: Equivalent to ..."" doesn't count ""Synchronization:"" as determined semantics","2016-06 (Oulu)","|Complete|","","" "`LWG2711 `__","path is convertible from approximately everything under the sun","2016-06 (Oulu)","|Complete|","","" "`LWG2716 `__","Specification of shuffle and sample disallows lvalue URNGs","2016-06 (Oulu)","|Complete|","","" diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv index 524c6d0ac8be0..360b5520260ce 100644 --- a/libcxx/docs/Status/Cxx20Papers.csv +++ b/libcxx/docs/Status/Cxx20Papers.csv @@ -34,7 +34,7 @@ "`P0528R3 `__","The Curious Case of Padding Bits, Featuring Atomic Compare-and-Exchange","2018-06 (Rapperswil)","","","" "`P0542R5 `__","Support for contract based programming in C++","2018-06 (Rapperswil)","|Nothing To Do|","n/a","Pulled at the 2019-07 meeting in Cologne" "`P0556R3 `__","Integral power-of-2 operations","2018-06 (Rapperswil)","|Complete|","9","" -"`P0619R4 `__","Reviewing Deprecated Facilities of C++17 for C++20","2018-06 (Rapperswil)","|Complete|","20","Removed headers are still provided as an extension, but with deprecation warnings" +"`P0619R4 `__","Reviewing Deprecated Facilities of C++17 for C++20","2018-06 (Rapperswil)","|Complete|","20","Removed headers are still provided as an extension, but with deprecation warnings." 
"`P0646R1 `__","Improving the Return Value of Erase-Like Algorithms","2018-06 (Rapperswil)","|Complete|","10","" "`P0722R3 `__","Efficient sized delete for variable sized classes","2018-06 (Rapperswil)","|Complete|","9","" "`P0758R1 `__","Implicit conversion traits and utility functions","2018-06 (Rapperswil)","|Complete|","","" @@ -43,7 +43,7 @@ "`P0788R3 `__","Standard Library Specification in a Concepts and Contracts World","2018-06 (Rapperswil)","|Nothing To Do|","n/a","Pulled at the 2019-07 meeting in Cologne" "`P0879R0 `__","Constexpr for swap and swap related functions Also resolves LWG issue 2800.","2018-06 (Rapperswil)","|Complete|","13","" "`P0887R1 `__","The identity metafunction","2018-06 (Rapperswil)","|Complete|","8","" -"`P0892R2 `__","explicit(bool)","2018-06 (Rapperswil)","","","" +"`P0892R2 `__","P0892R2: explicit(bool)","2018-06 (Rapperswil)","|Nothing To Do|","","" "`P0898R3 `__","Standard Library Concepts","2018-06 (Rapperswil)","|Complete|","13","" "`P0935R0 `__","Eradicating unnecessarily explicit default constructors from the standard library","2018-06 (Rapperswil)","|Complete|","12","" "`P0941R2 `__","Integrating feature-test macros into the C++ WD","2018-06 (Rapperswil)","|In Progress|","","" @@ -174,7 +174,7 @@ "`P1868R2 `__","width: clarifying units of width and precision in std::format","2020-02 (Prague)","|Complete|","14","" "`P1956R1 `__","On the names of low-level bit manipulation functions","2020-02 (Prague)","|Complete|","12","" "`P1957R2 `__","Converting from ``T*``\ to bool should be considered narrowing (re: US 212)","2020-02 (Prague)","|Complete|","18","" -"`P1963R0 `__","Fixing US 313","2020-02 (Prague)","","","" +"`P1963R0 `__","P1963R0: Fixing US 313","2020-02 (Prague)","|Nothing To Do|","","" "`P1964R2 `__","Wording for boolean-testable","2020-02 (Prague)","|Complete|","13","" "`P1970R2 `__","Consistency for size() functions: Add ranges::ssize","2020-02 (Prague)","|Complete|","15","" "`P1973R1 `__","Rename ""_default_init"" Functions, Rev1","2020-02 (Prague)","|Complete|","16","The feature-test macro was not set until LLVM 20." 
@@ -184,7 +184,7 @@ "`P1983R0 `__","Wording for GB301, US296, US292, US291, and US283","2020-02 (Prague)","|Complete|","15","" "`P1994R1 `__","elements_view needs its own sentinel","2020-02 (Prague)","|Complete|","16","" "`P2002R1 `__","Defaulted comparison specification cleanups","2020-02 (Prague)","|Complete|","7","" -"`P2045R1 `__","Missing Mandates for the standard library","2020-02 (Prague)","","","" +"`P2045R1 `__","P2045R1: Missing Mandates for the standard library","2020-02 (Prague)","|Nothing To Do|","","" "`P2085R0 `__","Consistent defaulted comparisons","2020-02 (Prague)","","","" "`P2091R0 `__","Issues with range access CPOs","2020-02 (Prague)","|Complete|","15","" "`P2101R0 `__","P2101R0: 'Models' subsumes 'satisfies' (Wording for US298 and US300)","2020-02 (Prague)","|Nothing To Do|","","" diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv index 264c5417a5c28..bfaa63a7c224e 100644 --- a/libcxx/docs/Status/Cxx23Papers.csv +++ b/libcxx/docs/Status/Cxx23Papers.csv @@ -100,7 +100,7 @@ "`P2396R1 `__","Concurrency TS 2 fixes ","2022-11 (Kona)","","","" "`P2505R5 `__","Monadic Functions for ``std::expected``","2022-11 (Kona)","|Complete|","17","" "`P2539R4 `__","Should the output of ``std::print`` to a terminal be synchronized with the underlying stream?","2022-11 (Kona)","|Complete|","18","" -"`P2602R2 `__","Poison Pills are Too Toxic","2022-11 (Kona)","|Complete|","19","Implemented as a DR in C++20" +"`P2602R2 `__","Poison Pills are Too Toxic","2022-11 (Kona)","|Complete|","19","Implemented as a DR in C++20." "`P2708R1 `__","No Further Fundamentals TSes","2022-11 (Kona)","|Nothing To Do|","","" "","","","","","" "`P0290R4 `__","``apply()`` for ``synchronized_value``","2023-02 (Issaquah)","","","" diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index 65fd335a0309f..878471f1e782b 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -15,7 +15,7 @@ "`P1901R2 `__","Enabling the Use of ``weak_ptr`` as Keys in Unordered Associative Containers","2023-06 (Varna)","","","" "`P1885R12 `__","Naming Text Encodings to Demystify Them","2023-06 (Varna)","","","" "`P0792R14 `__","``function_ref``: a type-erased callable reference","2023-06 (Varna)","","","" -"`P2874R2 `__","Mandating Annex D Require No More","2023-06 (Varna)","","","" +"`P2874R2 `__","P2874R2: Mandating Annex D Require No More","2023-06 (Varna)","|Complete|","12","" "`P2757R3 `__","Type-checking format args","2023-06 (Varna)","","","" "`P2637R3 `__","Member ``visit``","2023-06 (Varna)","|Complete|","19","Change of ``__cpp_lib_variant`` is completed in LLVM 20. Change of ``__cpp_lib_format`` is blocked by `P2419R2 `__." 
"`P2641R4 `__","Checking if a ``union`` alternative is active","2023-06 (Varna)","","","" From ec544035227bd88e3622b85ba70499cb0e62b2bc Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 17 Feb 2025 10:26:10 -0500 Subject: [PATCH 026/127] [libc++] Synchronize a few remaining status page rows with Github issues --- libcxx/docs/Status/Cxx20Issues.csv | 2 +- libcxx/docs/Status/Cxx2cPapers.csv | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index ca286146840b1..3462557e8d668 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -238,7 +238,7 @@ "`LWG3313 `__","``join_view::iterator::operator--``\ is incorrectly constrained","2020-02 (Prague)","|Complete|","14","" "`LWG3314 `__","Is stream insertion behavior locale dependent when ``Period::type``\ is ``micro``\ ?","2020-02 (Prague)","|Complete|","16","" "`LWG3315 `__","LWG3315: Correct Allocator Default Behavior","2020-02 (Prague)","|Complete|","","" -"`LWG3316 `__","Correctly define epoch for ``utc_clock``\ / ``utc_timepoint``\ ","2020-02 (Prague)","|Nothing To Do|","","" +"`LWG3316 `__","Correctly define epoch for ``utc_clock``\ / ``utc_timepoint``\ ","2020-02 (Prague)","","","" "`LWG3317 `__","Incorrect ``operator<<``\ for floating-point durations","2020-02 (Prague)","|Complete|","16","" "`LWG3318 `__","Clarify whether clocks can represent time before their epoch","2020-02 (Prague)","","","" "`LWG3319 `__","Properly reference specification of IANA time zone database","2020-02 (Prague)","|Nothing To Do|","","" diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index 878471f1e782b..b2bb1d6e9d6c3 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -2,7 +2,7 @@ "`P2497R0 `__","Testing for success or failure of ```` functions","2023-06 (Varna)","|Complete|","18","" "`P2592R3 `__","Hashing support for ``std::chrono`` value classes","2023-06 (Varna)","","","" "`P2587R3 `__","``to_string`` or not ``to_string``","2023-06 (Varna)","","","" -"`P2562R1 `__","``constexpr`` Stable Sorting","2023-06 (Varna)","|Partial|","20.0","" +"`P2562R1 `__","``constexpr`` Stable Sorting","2023-06 (Varna)","|Partial|","20","" "`P2545R4 `__","Read-Copy Update (RCU)","2023-06 (Varna)","","","" "`P2530R3 `__","Hazard Pointers for C++26","2023-06 (Varna)","","","" "`P2538R1 `__","ADL-proof ``std::projected``","2023-06 (Varna)","|Complete|","18","" @@ -24,7 +24,7 @@ "`P1383R2 `__","More ``constexpr`` for ```` and ````","2023-06 (Varna)","","","" "`P2734R0 `__","Adding the new SI prefixes","2023-06 (Varna)","|Complete|","17","" "`P2548R6 `__","``copyable_function``","2023-06 (Varna)","","","" -"`P2714R1 `__","Bind front and back to NTTP callables","2023-06 (Varna)","|Partial|","20","``not_fn`` only" +"`P2714R1 `__","Bind front and back to NTTP callables","2023-06 (Varna)","|Partial|","20","" "`P2630R4 `__","``submdspan``","2023-06 (Varna)","","","" "","","","","","" "`P0543R3 `__","Saturation arithmetic","2023-11 (Kona)","|Complete|","18","" From 788cb725d8b92a82e41e64540dccca97c9086a58 Mon Sep 17 00:00:00 2001 From: Brian Cain Date: Mon, 17 Feb 2025 09:30:48 -0600 Subject: [PATCH 027/127] [Hexagon] Explicitly truncate constant in UAddSubO (#127360) After #117558 landed, this code would assert "Value is not an N-bit unsigned value" in getConstant(), from a test case in zig. 
Co-authored-by: Craig Topper Fixes #127296 --- .../lib/Target/Hexagon/HexagonISelLowering.cpp | 2 +- llvm/test/CodeGen/Hexagon/iss127296.ll | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/Hexagon/iss127296.ll diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 1a7667fe42fbc..b31360b4096da 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -3273,7 +3273,7 @@ HexagonTargetLowering::LowerUAddSubO(SDValue Op, SelectionDAG &DAG) const { if (Opc == ISD::USUBO) { SDValue Op = DAG.getNode(ISD::SUB, dl, VTs.VTs[0], {X, Y}); SDValue Ov = DAG.getSetCC(dl, MVT::i1, Op, - DAG.getConstant(-1, dl, ty(Op)), ISD::SETEQ); + DAG.getAllOnesConstant(dl, ty(Op)), ISD::SETEQ); return DAG.getMergeValues({Op, Ov}, dl); } } diff --git a/llvm/test/CodeGen/Hexagon/iss127296.ll b/llvm/test/CodeGen/Hexagon/iss127296.ll new file mode 100644 index 0000000000000..bf0e7a9881014 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/iss127296.ll @@ -0,0 +1,18 @@ +; RUN: llc -mtriple=hexagon -O0 < %s | FileCheck %s + +; CHECK: r0 = add(r0,#-1) + +define fastcc void @os.linux.tls.initStatic(i32 %x) { + %1 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %x, i32 1) + br label %2 + + 2: ; preds = %0 + %3 = extractvalue { i32, i1 } %1, 0 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + From 6c627831f9a4ba5d9680cc83d610c1894a84908a Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 17 Feb 2025 16:40:37 +0100 Subject: [PATCH 028/127] [VPlan] Use VPlan predecessors in VPWidenPHIRecipe (NFC). (#126388) Update VPWidenPHIRecipe to use the predecessors in VPlan to determine the incoming blocks instead of tracking them separately. This brings VPWidenPHIRecipe in line with the other phi recipes. PR: https://github.com/llvm/llvm-project/pull/126388 --- llvm/lib/Transforms/Vectorize/VPlan.h | 24 +++++-------------- .../Transforms/Vectorize/VPlanHCFGBuilder.cpp | 22 ++++++++++------- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 21 ++++++++++++++++ llvm/lib/Transforms/Vectorize/VPlanUtils.h | 10 +------- .../LoopVectorize/outer-loop-wide-phis.ll | 2 +- .../Transforms/Vectorize/VPlanTest.cpp | 5 ++-- 6 files changed, 45 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index fbbc466f2f7f6..8089cfd1ce802 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1958,13 +1958,12 @@ class VPScalarPHIRecipe : public VPHeaderPHIRecipe { #endif }; -/// A recipe for handling phis that are widened in the vector loop. -/// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are -/// managed in the recipe directly. +/// A recipe for widened phis. Incoming values are operands of the recipe and +/// their operand index corresponds to the incoming predecessor block. If the +/// recipe is placed in an entry block to a (non-replicate) region, it must have +/// exactly 2 incoming values, the first from the predecessor of the region and +/// the second from the exiting block of the region. class VPWidenPHIRecipe : public VPSingleDefRecipe { - /// List of incoming blocks. Only used in the VPlan native path. 
- SmallVector IncomingBlocks; - public: /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start and /// debug location \p DL. @@ -1991,19 +1990,8 @@ class VPWidenPHIRecipe : public VPSingleDefRecipe { VPSlotTracker &SlotTracker) const override; #endif - /// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi. - void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) { - addOperand(IncomingV); - IncomingBlocks.push_back(IncomingBlock); - } - /// Returns the \p I th incoming VPBasicBlock. - VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; } - - /// Set the \p I th incoming VPBasicBlock to \p IncomingBlock. - void setIncomingBlock(unsigned I, VPBasicBlock *IncomingBlock) { - IncomingBlocks[I] = IncomingBlock; - } + VPBasicBlock *getIncomingBlock(unsigned I); /// Returns the \p I th incoming VPValue. VPValue *getIncomingValue(unsigned I) { return getOperand(I); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index 5a2e5d7cfee48..33a367a0b65c1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -136,19 +136,23 @@ void PlainCFGBuilder::fixPhiNodes() { // predecessor is the first operand of the recipe. assert(Phi->getNumOperands() == 2); BasicBlock *LoopPred = L->getLoopPredecessor(); - VPPhi->addIncoming( - getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopPred)), - BB2VPBB[LoopPred]); + VPPhi->addOperand( + getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopPred))); BasicBlock *LoopLatch = L->getLoopLatch(); - VPPhi->addIncoming( - getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopLatch)), - BB2VPBB[LoopLatch]); + VPPhi->addOperand( + getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopLatch))); continue; } - for (unsigned I = 0; I != Phi->getNumOperands(); ++I) - VPPhi->addIncoming(getOrCreateVPOperand(Phi->getIncomingValue(I)), - BB2VPBB[Phi->getIncomingBlock(I)]); + // Add operands for VPPhi in the order matching its predecessors in VPlan. + DenseMap VPPredToIncomingValue; + for (unsigned I = 0; I != Phi->getNumOperands(); ++I) { + VPPredToIncomingValue[BB2VPBB[Phi->getIncomingBlock(I)]] = + getOrCreateVPOperand(Phi->getIncomingValue(I)); + } + for (VPBlockBase *Pred : VPPhi->getParent()->getPredecessors()) + VPPhi->addOperand( + VPPredToIncomingValue.lookup(Pred->getExitingBasicBlock())); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1bba667c206cf..d57a6c481748c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3621,6 +3621,27 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +VPBasicBlock *VPWidenPHIRecipe::getIncomingBlock(unsigned I) { + VPBasicBlock *Parent = getParent(); + VPBlockBase *Pred = nullptr; + if (Parent->getNumPredecessors() > 0) { + Pred = Parent->getPredecessors()[I]; + } else { + auto *Region = Parent->getParent(); + assert(Region && !Region->isReplicator() && Region->getEntry() == Parent && + "must be in the entry block of a non-replicate region"); + assert( + I < 2 && getNumOperands() == 2 && + "when placed in an entry block, only 2 incoming blocks are available"); + + // I == 0 selects the predecessor of the region, I == 1 selects the region + // itself whose exiting block feeds the phi across the backedge. + Pred = I == 0 ? 
Region->getSinglePredecessor() : Region; + } + + return Pred->getExitingBasicBlock(); +} + void VPWidenPHIRecipe::execute(VPTransformState &State) { assert(EnableVPlanNativePath && "Non-native vplans are not expected to have VPWidenPHIRecipes."); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index ac5e1978fcfbe..6ddb88308955f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -169,16 +169,8 @@ class VPBlockUtils { static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New) { for (auto *Pred : to_vector(Old->getPredecessors())) Pred->replaceSuccessor(Old, New); - for (auto *Succ : to_vector(Old->getSuccessors())) { + for (auto *Succ : to_vector(Old->getSuccessors())) Succ->replacePredecessor(Old, New); - - // Replace any references to Old in widened phi incoming blocks. - for (auto &R : Succ->getEntryBasicBlock()->phis()) - if (auto *WidenPhiR = dyn_cast(&R)) - for (unsigned I = 0; I < WidenPhiR->getNumOperands(); I++) - if (WidenPhiR->getIncomingBlock(I) == Old) - WidenPhiR->setIncomingBlock(I, cast(New)); - } New->setPredecessors(Old->getPredecessors()); New->setSuccessors(Old->getSuccessors()); Old->clearPredecessors(); diff --git a/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll b/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll index 3f81c0f5c822a..c5d2f6acf85b3 100644 --- a/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll @@ -134,7 +134,7 @@ define void @wide_phi_2_predecessors_phi_ops_swapped(ptr noalias %A, ptr noalias ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP1]], i32 8, <4 x i1> splat (i1 true), <4 x i64> poison) ; CHECK-NEXT: br label %[[INNER_LATCH4]] ; CHECK: [[INNER_LATCH4]]: -; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i64> [ zeroinitializer, %[[INNER_HEADER1]] ], [ [[WIDE_MASKED_GATHER]], %[[THEN3]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i64> [ [[WIDE_MASKED_GATHER]], %[[THEN3]] ], [ zeroinitializer, %[[INNER_HEADER1]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i64> [[VEC_PHI5]], [[VEC_IND]] ; CHECK-NEXT: [[TMP3]] = add nsw <4 x i64> [[TMP2]], [[VEC_PHI2]] ; CHECK-NEXT: [[TMP4]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1) diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 2f37c08bd9f11..5f73aa43daef9 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -672,7 +672,7 @@ TEST_F(VPBasicBlockTest, reassociateBlocks) { auto *WidenPhi = new VPWidenPHIRecipe(nullptr); IntegerType *Int32 = IntegerType::get(C, 32); VPValue *Val = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); - WidenPhi->addIncoming(Val, VPBB1); + WidenPhi->addOperand(Val); VPBB2->appendRecipe(WidenPhi); VPBasicBlock *VPBBNew = Plan.createVPBasicBlock("VPBBNew"); @@ -693,7 +693,8 @@ TEST_F(VPBasicBlockTest, reassociateBlocks) { auto *WidenPhi = new VPWidenPHIRecipe(nullptr); IntegerType *Int32 = IntegerType::get(C, 32); VPValue *Val = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); - WidenPhi->addIncoming(Val, VPBB1); + WidenPhi->addOperand(Val); + WidenPhi->addOperand(Val); VPBB2->appendRecipe(WidenPhi); VPBasicBlock *VPBBNew = Plan.createVPBasicBlock("VPBBNew"); From c5ea469f4dafe4c310ba26511575afda3569b0b5 Mon Sep 17 00:00:00 2001 From: Pranav Bhandarkar Date: Mon, 17 Feb 2025 09:45:06 -0600 
Subject: [PATCH 029/127] [OMPIRBuilder] - Fix emitTargetTaskProxyFunc to not generate empty functions (#126958) This is a fix for https://github.com/llvm/llvm-project/issues/126949 There are two issues being fixed here. First, in some cases, OMPIRBuilder generates empty target task proxy functions. This happens when the target kernel doesn't use any stack-allocated data (either no data or only globals). The second problem is encountered when the target task i.e the code that makes the target call spans a single basic block. This usually happens when we do not generate a target or device kernel launch and instead fall back to the host. In such cases, we end up not outlining the target task entirely. This can cause us to call target kernel twice - once via the target task proxy function and a second time via the host fallback This PR fixes both of these problems and updates some tests to catch these problems should this patch fail. --- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 31 ++++++++++++++----- .../LLVMIR/omptarget-depend-host-only.mlir | 21 +++++++++++++ .../LLVMIR/omptarget-nowait-host-only.mlir | 20 ++++++++++++ 3 files changed, 65 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 04acab1e5765e..7ba23b0bd377e 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -7099,10 +7099,11 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, Function *KernelLaunchFunction = StaleCI->getCalledFunction(); // StaleCI is the CallInst which is the call to the outlined - // target kernel launch function. If there are values that the - // outlined function uses then these are aggregated into a structure - // which is passed as the second argument. If not, then there's - // only one argument, the threadID. So, StaleCI can be + // target kernel launch function. If there are local live-in values + // that the outlined function uses then these are aggregated into a structure + // which is passed as the second argument. If there are no local live-in + // values or if all values used by the outlined kernel are global variables, + // then there's only one argument, the threadID. So, StaleCI can be // // %structArg = alloca { ptr, ptr }, align 8 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0 @@ -7140,6 +7141,8 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, // host and device. 
assert((!HasShareds || (StaleCI->arg_size() == 2)) && "StaleCI with shareds should have exactly two arguments."); + + Value *ThreadId = ProxyFn->getArg(0); if (HasShareds) { auto *ArgStructAlloca = dyn_cast(StaleCI->getArgOperand(1)); assert(ArgStructAlloca && @@ -7150,7 +7153,6 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, AllocaInst *NewArgStructAlloca = Builder.CreateAlloca(ArgStructType, nullptr, "structArg"); Value *TaskT = ProxyFn->getArg(1); - Value *ThreadId = ProxyFn->getArg(0); Value *SharedsSize = Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType)); @@ -7163,7 +7165,10 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize); Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca}); + } else { + Builder.CreateCall(KernelLaunchFunction, {ThreadId}); } + Builder.CreateRetVoid(); return ProxyFn; } @@ -7306,11 +7311,23 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false)); Builder.restoreIP(TargetTaskBodyIP); - if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP)) return Err; - OI.ExitBB = Builder.saveIP().getBlock(); + // The outliner (CodeExtractor) extract a sequence or vector of blocks that + // it is given. These blocks are enumerated by + // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock + // to be outside the region. In other words, OI.ExitBlock is expected to be + // the start of the region after the outlining. We used to set OI.ExitBlock + // to the InsertBlock after TaskBodyCB is done. This is fine in most cases + // except when the task body is a single basic block. In that case, + // OI.ExitBlock is set to the single task body block and will get left out of + // the outlining process. So, simply create a new empty block to which we + // uncoditionally branch from where TaskBodyCB left off + OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont"); + emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(), + /*IsFinished=*/true); + OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait, DeviceID](Function &OutlinedFn) mutable { assert(OutlinedFn.getNumUses() == 1 && diff --git a/mlir/test/Target/LLVMIR/omptarget-depend-host-only.mlir b/mlir/test/Target/LLVMIR/omptarget-depend-host-only.mlir index 621a206e18053..ece32bb5419c6 100644 --- a/mlir/test/Target/LLVMIR/omptarget-depend-host-only.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-depend-host-only.mlir @@ -25,8 +25,29 @@ module attributes {omp.is_target_device = false} { // CHECK: define void @omp_target_depend_() // CHECK-NOT: define {{.*}} @ // CHECK-NOT: call i32 @__tgt_target_kernel({{.*}}) +// CHECK: call void @__kmpc_omp_task_begin_if0 +// CHECK-NEXT: call void @.omp_target_task_proxy_func +// CHECK: call void @__kmpc_omp_task_complete_if0 +// https://github.com/llvm/llvm-project/issues/126949 exposes two issues +// 1. Empty target task proxy functions +// 2. When 1 fixed, it leads to a second problem of calling the omp target kernel twice +// Once via the target task proxy function and a second time after the target task is done. +// The following checks check problem #2. +// functions. The following checks tests the fix for this issue. 
+// CHECK-NEXT: br label %[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY:.*]] +// CHECK:[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY]]: +// CHECK-NEXT: ret void + +// CHECK: define internal void @omp_target_depend_..omp_par // CHECK: call void @__omp_offloading_[[DEV:.*]]_[[FIL:.*]]_omp_target_depend__l[[LINE:.*]](ptr {{.*}}) +// CHECK-NEXT: br label %[[BLOCK_AFTER_TARGET_TASK_BODY:.*]] +// CHECK: [[BLOCK_AFTER_TARGET_TASK_BODY]]: // CHECK-NEXT: ret void + // CHECK: define internal void @__omp_offloading_[[DEV]]_[[FIL]]_omp_target_depend__l[[LINE]](ptr %[[ADDR_A:.*]]) // CHECK: store i32 100, ptr %[[ADDR_A]], align 4 + +// The following checks test the fix for problem #1 as described in https://github.com/llvm/llvm-project/issues/126949 +// CHECK: define internal void @.omp_target_task_proxy_func +// CHECK: call void @omp_target_depend_..omp_par diff --git a/mlir/test/Target/LLVMIR/omptarget-nowait-host-only.mlir b/mlir/test/Target/LLVMIR/omptarget-nowait-host-only.mlir index 6b634226a3568..94d8d052d087e 100644 --- a/mlir/test/Target/LLVMIR/omptarget-nowait-host-only.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-nowait-host-only.mlir @@ -20,10 +20,30 @@ module attributes {omp.is_target_device = false} { // CHECK: define void @omp_target_nowait_() // CHECK-NOT: define {{.*}} @ // CHECK-NOT: call ptr @__kmpc_omp_target_task_alloc({{.*}}) +// CHECK: call void @__kmpc_omp_task_begin_if0 +// CHECK-NEXT: call void @.omp_target_task_proxy_func +// CHECK: call void @__kmpc_omp_task_complete_if0 +// https://github.com/llvm/llvm-project/issues/126949 exposes two issues +// 1. Empty target task proxy functions +// 2. When 1 is fixed, it leads to a second problem of calling the omp target kernel twice +// Once via the target task proxy function and a second time after the target task is done. +// The following checks test the fix for problem #2. +// CHECK-NEXT: br label %[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY:.*]] +// CHECK:[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY]]: +// CHECK-NEXT: ret void + // Verify that we directly emit a call to the "target" region's body from the // parent function of the `omp.target` op. +// CHECK: define internal void @omp_target_nowait_..omp_par // CHECK: call void @__omp_offloading_[[DEV:.*]]_[[FIL:.*]]_omp_target_nowait__l[[LINE:.*]](ptr {{.*}}) +// CHECK-NEXT: br label %[[BLOCK_AFTER_TARGET_TASK_BODY:.*]] +// CHECK: [[BLOCK_AFTER_TARGET_TASK_BODY]]: // CHECK-NEXT: ret void // CHECK: define internal void @__omp_offloading_[[DEV]]_[[FIL]]_omp_target_nowait__l[[LINE]](ptr %[[ADDR_X:.*]]) // CHECK: store float 5{{.*}}, ptr %[[ADDR_X]], align 4 + +// The following checks test the fix for problem #1 as described in https://github.com/llvm/llvm-project/issues/126949 +// CHECK: define internal void @.omp_target_task_proxy_func +// CHECK: call void @omp_target_nowait_..omp_par From d29045622a1db3cd3729c1fe3fba7eebd5536517 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 17 Feb 2025 15:50:25 +0000 Subject: [PATCH 030/127] [X86] combineConcatVectorOps - fold concat(EXTEND_VECTOR_INREG(x),EXTEND_VECTOR_INREG(y)) -> EXTEND_VECTOR_INREG(unpack(x,y)) (#127502) Concat/unpack the src subvectors together in the bottom 128-bit vector and then extend with a single EXTEND/EXTEND_VECTOR_INREG instruction. This required the getEXTEND_VECTOR_INREG helper to be tweaked to accept EXTEND_VECTOR_INREG opcodes as well, to avoid having to remap the opcode between both types.
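A minimal sketch of what the new fold amounts to, written with AVX2 intrinsics rather than SelectionDAG nodes; the helper names below are illustrative and not taken from the patch. Rather than extending each 128-bit source separately and concatenating the results, the sources are unpacked into the bottom 128 bits and widened with a single 256-bit extend.

#include <immintrin.h>  // assumes an AVX2 target (e.g. compiled with -mavx2)

// Before the fold: two 128-bit zero-extends followed by a concat.
static __m256i widen_before(__m128i x, __m128i y) {
  __m128i lo = _mm_cvtepu16_epi32(x);        // zero-extend x[0..3] to 32 bits
  __m128i hi = _mm_cvtepu16_epi32(y);        // zero-extend y[0..3] to 32 bits
  return _mm256_set_m128i(hi, lo);           // concat -> { x0..x3, y0..y3 }
}

// After the fold: unpack both sources into one register, then one extend.
static __m256i widen_after(__m128i x, __m128i y) {
  __m128i packed = _mm_unpacklo_epi64(x, y); // { x0..x3, y0..y3 } as 16-bit
  return _mm256_cvtepu16_epi32(packed);      // single vpmovzxwd ymm, xmm
}

Both functions return the same vector; the second form needs only one extension instruction, mirroring the concat(ZERO_EXTEND_VECTOR_INREG(x), ZERO_EXTEND_VECTOR_INREG(y)) -> ZERO_EXTEND_VECTOR_INREG(unpack(x, y)) shape the combine now produces.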
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 45 +- .../vector-interleaved-store-i8-stride-8.ll | 3934 +++++++---------- llvm/test/CodeGen/X86/widen_bitcnt.ll | 256 +- 3 files changed, 1885 insertions(+), 2350 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 85ad391ade299..6ed69dbd6dae0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4679,9 +4679,24 @@ static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG) { EVT InVT = In.getValueType(); assert(VT.isVector() && InVT.isVector() && "Expected vector VTs."); - assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode || - ISD::ZERO_EXTEND == Opcode) && - "Unknown extension opcode"); + + // Canonicalize Opcode to general extension version. + switch (Opcode) { + case ISD::ANY_EXTEND: + case ISD::ANY_EXTEND_VECTOR_INREG: + Opcode = ISD::ANY_EXTEND; + break; + case ISD::SIGN_EXTEND: + case ISD::SIGN_EXTEND_VECTOR_INREG: + Opcode = ISD::SIGN_EXTEND; + break; + case ISD::ZERO_EXTEND: + case ISD::ZERO_EXTEND_VECTOR_INREG: + Opcode = ISD::ZERO_EXTEND; + break; + default: + llvm_unreachable("Unknown extension opcode"); + } // For 256-bit vectors, we only need the lower (128-bit) input half. // For 512-bit vectors, we only need the lower input half or quarter. @@ -57864,6 +57879,30 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, } } break; + case ISD::ANY_EXTEND_VECTOR_INREG: + case ISD::SIGN_EXTEND_VECTOR_INREG: + case ISD::ZERO_EXTEND_VECTOR_INREG: { + // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND. + if (!IsSplat && NumOps == 2 && VT.is256BitVector() && + Subtarget.hasInt256() && + Op0.getOperand(0).getValueType().is128BitVector() && + Op0.getOperand(0).getValueType() == + Ops[0].getOperand(0).getValueType()) { + EVT SrcVT = Op0.getOperand(0).getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + MVT UnpackSVT = + MVT::getIntegerVT(SrcVT.getScalarSizeInBits() * (NumElts / 2)); + MVT UnpackVT = + MVT::getVectorVT(UnpackSVT, 128 / UnpackSVT.getScalarSizeInBits()); + SDValue Unpack = + DAG.getNode(X86ISD::UNPCKL, DL, UnpackVT, + DAG.getBitcast(UnpackVT, Ops[0].getOperand(0)), + DAG.getBitcast(UnpackVT, Ops[1].getOperand(0))); + return getEXTEND_VECTOR_INREG(Op0.getOpcode(), DL, VT, + DAG.getBitcast(SrcVT, Unpack), DAG); + } + break; + } case X86ISD::VSHLI: case X86ISD::VSRLI: // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle. 
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index 675412defbb24..6fee9377d261a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -2982,223 +2982,182 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-LABEL: store_i8_stride8_vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $88, %rsp +; AVX2-NEXT: subq $40, %rsp ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: vmovdqa (%rsi), %xmm2 -; AVX2-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] -; AVX2-NEXT: vmovdqa (%r10), %xmm6 -; AVX2-NEXT: vmovdqa (%rax), %xmm7 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm13[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm9 -; AVX2-NEXT: vmovdqa (%r9), %xmm10 -; AVX2-NEXT: vmovdqa (%r8), %xmm11 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm14[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm8, %ymm15 -; AVX2-NEXT: vmovaps 16(%rsi), %xmm8 -; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm9[1],ymm15[2],ymm9[3],ymm15[4,5,6,7,8],ymm9[9],ymm15[10],ymm9[11],ymm15[12,13,14,15] -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3],ymm0[4],ymm15[5],ymm0[6],ymm15[7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = 
xmm12[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa 16(%rcx), %xmm8 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm13 -; AVX2-NEXT: vmovdqa 16(%rdx), %xmm15 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5],ymm13[6],ymm1[7],ymm13[8,9,10,11,12],ymm1[13],ymm13[14],ymm1[15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5],ymm7[6],ymm2[7],ymm7[8,9,10,11,12],ymm2[13],ymm7[14],ymm2[15] +; AVX2-NEXT: vmovdqa (%r10), %xmm3 +; AVX2-NEXT: vmovdqa (%rax), %xmm4 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; 
AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vmovdqa (%r9), %xmm6 +; AVX2-NEXT: vmovdqa (%r8), %xmm0 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5,6,7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13,14,15] +; AVX2-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm9 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero,xmm13[2],zero,zero,zero,xmm13[3],zero,zero,zero +; AVX2-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-NEXT: vmovdqa (%rdx), %xmm10 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm15 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm15 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7,8],ymm15[9],ymm9[10,11,12],ymm15[13],ymm9[14,15] +; AVX2-NEXT: vmovaps 16(%r10), %xmm7 +; AVX2-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0],ymm12[1],ymm15[2],ymm12[3],ymm15[4],ymm12[5],ymm15[6],ymm12[7] +; AVX2-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 +; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,7,7] +; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX2-NEXT: vmovdqa 16(%rax), %xmm12 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0,1,2,3,4],ymm2[5],ymm11[6],ymm2[7],ymm11[8,9,10,11,12],ymm2[13],ymm11[14],ymm2[15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm14[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm11, %ymm11 +; AVX2-NEXT: vmovdqa 16(%r9), %xmm14 +; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero,xmm13[2],zero,zero,zero,xmm13[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7,8],ymm11[9],ymm13[10,11,12],ymm11[13],ymm13[14,15] +; AVX2-NEXT: vmovdqa 16(%r8), %xmm15 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-NEXT: vmovdqa %xmm9, %xmm5 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm10 = 
xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX2-NEXT: vmovdqa %xmm8, %xmm9 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-NEXT: vmovdqa 16(%r10), %xmm8 -; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3,4],ymm13[5],ymm7[6,7,8],ymm13[9],ymm7[10,11,12],ymm13[13],ymm7[14,15] -; AVX2-NEXT: vmovdqa 16(%rax), %xmm4 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm14, %ymm2 -; AVX2-NEXT: vmovdqa 16(%r9), %xmm3 -; AVX2-NEXT: vmovdqa 16(%r8), %xmm1 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm14[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7],ymm0[8,9,10,11,12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm4, %ymm11 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,7,7] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6],ymm11[7],ymm0[8,9,10,11,12],ymm11[13],ymm0[14],ymm11[15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4],ymm0[5],ymm7[6],ymm0[7] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = 
xmm2[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm8, %ymm8 +; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm12[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm15, %ymm7 -; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7,8],ymm7[9],ymm0[10,11,12],ymm7[13],ymm0[14,15] -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6],ymm5[7],ymm3[8,9,10,11,12],ymm5[13],ymm3[14],ymm5[15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-NEXT: vinserti128 
$1, %xmm5, %ymm3, %ymm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4,5,6,7,8],ymm3[9],ymm1[10],ymm3[11],ymm1[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5,6,7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3],xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm11 = 
xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm11[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,6,5,7,7] +; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm8, %ymm8 +; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm0[5],ymm8[6],ymm0[7],ymm8[8,9,10,11,12],ymm0[13],ymm8[14],ymm0[15] +; AVX2-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX2-NEXT: vmovdqa 16(%rdx), %xmm4 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm3[8],xmm12[9],xmm3[9],xmm12[10],xmm3[10],xmm12[11],xmm3[11],xmm12[12],xmm3[12],xmm12[13],xmm3[13],xmm12[14],xmm3[14],xmm12[15],xmm3[15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm6, %ymm6 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,6,5,7,7] +; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 +; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5],ymm14[6],ymm6[7],ymm14[8,9,10,11,12],ymm6[13],ymm14[14],ymm6[15] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm4, %ymm4 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7,8],ymm0[9],ymm4[10,11,12],ymm0[13],ymm4[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[2,1,3,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5,6,7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7] +; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4,5,6,7,8],ymm1[9],ymm3[10],ymm1[11],ymm3[12,13,14,15] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpshufb %xmm5, %xmm13, %xmm3 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5,6,7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13,14,15] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-NEXT: # ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7,8],ymm3[9],ymm4[10,11,12],ymm3[13],ymm4[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX2-NEXT: vmovdqa %ymm2, 128(%rax) -; AVX2-NEXT: vmovdqa %ymm1, 192(%rax) +; AVX2-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX2-NEXT: vmovdqa %ymm1, 128(%rax) +; AVX2-NEXT: vmovdqa %ymm12, 192(%rax) ; AVX2-NEXT: vmovdqa %ymm0, 224(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-NEXT: vmovdqa %ymm8, 160(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, (%rax) -; AVX2-NEXT: addq $88, %rsp +; AVX2-NEXT: addq $40, %rsp ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3508,166 +3467,134 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa (%r10), %xmm1 ; AVX512-NEXT: vmovdqa 16(%r10), %xmm11 -; AVX512-NEXT: vmovdqa (%rax), %xmm2 +; AVX512-NEXT: vmovdqa (%rax), %xmm5 ; AVX512-NEXT: vmovdqa 16(%rax), %xmm12 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm22 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vmovdqa (%r9), %xmm3 -; AVX512-NEXT: vmovdqa 16(%r9), %xmm13 -; AVX512-NEXT: vmovdqa (%r8), %xmm4 -; AVX512-NEXT: vmovdqa 16(%r8), %xmm14 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512-NEXT: vmovdqa64 %xmm4, %xmm23 -; AVX512-NEXT: vmovdqa64 %xmm3, %xmm24 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7],ymm5[8,9,10],ymm1[11],ymm5[12,13,14],ymm1[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vmovdqa (%r9), %xmm6 +; AVX512-NEXT: vmovdqa 16(%r9), %xmm14 +; AVX512-NEXT: vmovdqa (%r8), %xmm7 +; AVX512-NEXT: vmovdqa 16(%r8), %xmm15 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti128 
$1, %xmm8, %ymm3, %ymm3 +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 ; AVX512-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm21 +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512-NEXT: vmovdqa (%rcx), %xmm8 ; AVX512-NEXT: vmovdqa (%rdx), %xmm9 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm15 = xmm10[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm15, %ymm3 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 -; AVX512-NEXT: vpunpcklbw 
{{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm10 -; AVX512-NEXT: vmovdqa 16(%rcx), %xmm5 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0,0,1,1,2,2,3,3] +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm13 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7,8],ymm13[9],ymm3[10,11,12],ymm13[13],ymm3[14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7,8],ymm10[9],ymm0[10,11,12],ymm10[13],ymm0[14,15] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm18 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vmovdqa 16(%rcx), %xmm2 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3],ymm10[4,5,6],ymm1[7],ymm10[8,9,10],ymm1[11],ymm10[12,13,14],ymm1[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7],ymm1[8,9,10],ymm3[11],ymm1[12,13,14],ymm3[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = 
xmm13[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm3, %ymm3 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] ; AVX512-NEXT: vmovdqa 16(%rdx), %xmm10 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm15 -; AVX512-NEXT: vmovdqa 16(%rsi), %xmm4 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm19 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm20 = xmm0[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512-NEXT: vinserti32x4 $1, %xmm20, %ymm19, %ymm1 -; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7,8],ymm15[9],ymm1[10,11,12],ymm15[13],ymm1[14,15] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm15 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm6, %ymm6 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm1, %ymm19 +; AVX512-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 
= xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7,8],ymm0[9],ymm4[10,11,12],ymm0[13],ymm4[14,15] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm19[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7,8],ymm4[9],ymm1[10,11,12],ymm4[13],ymm1[14,15] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm4[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm11 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,0,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm11, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 +; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3],ymm11[4,5,6],ymm1[7],ymm11[8,9,10],ymm1[11],ymm11[12,13,14],ymm1[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm11, %ymm4 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7,8],ymm0[9],ymm3[10,11,12],ymm0[13],ymm3[14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512-NEXT: 
vmovdqa64 %xmm23, %xmm3 -; AVX512-NEXT: vmovdqa64 %xmm24, %xmm4 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7] ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 @@ -3684,40 +3611,32 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 %xmm25, %xmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,6,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] +; AVX512-NEXT: vmovdqa64 %xmm20, %xmm4 +; AVX512-NEXT: vmovdqa64 %xmm21, %xmm5 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7,8],ymm2[9],ymm5[10,11,12],ymm2[13],ymm5[14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7,8],ymm3[9],ymm4[10,11,12],ymm3[13],ymm4[14,15] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa32 %zmm16, %zmm18 {%k1} -; AVX512-NEXT: vmovdqa32 %zmm17, %zmm15 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm17, %zmm19 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -3890,212 +3809,176 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm10 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm19 -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512DQ-NEXT: vmovdqa 16(%rcx), %xmm12 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm13 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm21 -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm22 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7,8],ymm7[9],ymm5[10,11,12],ymm7[13],ymm5[14,15] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: 
vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm16 -; AVX512DQ-NEXT: vmovdqa (%r10), %xmm5 -; AVX512DQ-NEXT: vmovdqa (%rax), %xmm6 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm9 -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm7 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm8 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm15, %ymm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3],ymm1[4,5,6],ymm9[7],ymm1[8,9,10],ymm9[11],ymm1[12,13,14],ymm9[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm9, %ymm9 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5,6],ymm0[7],ymm9[8,9,10],ymm0[11],ymm9[12,13,14],ymm0[15] -; AVX512DQ-NEXT: movw $-21846, %cx # imm = 0xAAAA -; AVX512DQ-NEXT: kmovw %ecx, %k1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm16 {%k1} -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm14, %ymm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-NEXT: vmovdqa 16(%r10), %xmm14 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: 
vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm9, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm15 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512DQ-NEXT: vmovdqa (%r10), %xmm0 +; AVX512DQ-NEXT: vmovdqa 16(%r10), %xmm9 +; AVX512DQ-NEXT: vmovdqa (%rax), %xmm2 +; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm10 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm23 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm18 -; AVX512DQ-NEXT: vmovdqa 16(%r9), %xmm3 -; AVX512DQ-NEXT: vmovdqa 16(%r8), %xmm9 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-NEXT: vmovdqa 16(%r9), %xmm11 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm6 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6],ymm4[7],ymm2[8,9,10],ymm4[11],ymm2[12,13,14],ymm4[15] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 ; 
AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7],ymm0[8,9,10],ymm1[11],ymm0[12,13,14],ymm1[15] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm17 {%k1} -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm10 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7],ymm5[8,9,10],ymm1[11],ymm5[12,13,14],ymm1[15] +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 +; 
AVX512DQ-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm13 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm25 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm15 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3] +; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm12 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7,8],ymm12[9],ymm15[10,11,12],ymm12[13],ymm15[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7,8],ymm13[9],ymm14[10,11,12],ymm13[13],ymm14[14,15] +; AVX512DQ-NEXT: vmovdqa 16(%r8), %xmm14 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm16 +; AVX512DQ-NEXT: movw $-21846, %ax # imm = 0xAAAA +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm5, %zmm16 {%k1} +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm12, %ymm5, %ymm17 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm13, %ymm12, %ymm18 +; AVX512DQ-NEXT: vmovdqa 16(%rcx), %xmm13 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm12, %ymm19 +; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm15 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,1,3,4,5,6,7] +; 
AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm0, %ymm20 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm12, %ymm0, %ymm21 +; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7,8],ymm12[9],ymm2[10,11,12],ymm12[13],ymm2[14,15] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm12 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm3 -; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm4 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm4 = 
xmm4[0],zero,xmm4[1],zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm2, %zmm12 {%k1} +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm11, %ymm11 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0,0,1,1,2,2,3,3] +; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2,3,4],ymm9[5],ymm5[6,7,8],ymm9[9],ymm5[10,11,12],ymm9[13],ymm5[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[2,1,3,3,6,5,7,7] 
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7,8],ymm9[9],ymm1[10,11,12],ymm9[13],ymm1[14,15] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm2 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm7 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; 
AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7,8],ymm5[9],ymm7[10,11,12],ymm5[13],ymm7[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1} +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm4 {%k1} ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -4266,102 +4149,58 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-NEXT: vmovdqa 16(%rsi), %xmm11 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512BW-NEXT: vmovdqa 16(%rcx), %xmm13 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512BW-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero -; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7,8],ymm7[9],ymm5[10,11,12],ymm7[13],ymm5[14,15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm22 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] -; AVX512BW-NEXT: vpermt2w %ymm6, %ymm22, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa (%r11), %xmm6 -; AVX512BW-NEXT: vmovdqa 16(%r11), %xmm15 -; 
AVX512BW-NEXT: vmovdqa (%r10), %xmm7 -; AVX512BW-NEXT: vmovdqa64 16(%r10), %xmm17 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX512BW-NEXT: vmovdqa (%r9), %xmm8 -; AVX512BW-NEXT: vmovdqa64 16(%r9), %xmm18 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm9 -; AVX512BW-NEXT: vmovdqa64 16(%r8), %xmm19 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] -; AVX512BW-NEXT: vpermt2w %zmm16, %zmm24, %zmm20 +; AVX512BW-NEXT: vmovdqa (%r11), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%r11), %xmm1 +; AVX512BW-NEXT: vmovdqa (%r10), %xmm2 +; AVX512BW-NEXT: vmovdqa 16(%r10), %xmm3 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512BW-NEXT: vmovdqa (%r9), %xmm5 +; AVX512BW-NEXT: vmovdqa 16(%r9), %xmm6 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm7 +; AVX512BW-NEXT: vmovdqa 16(%r8), %xmm8 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512BW-NEXT: vmovdqa 16(%rcx), %xmm11 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512BW-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15] +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512BW-NEXT: vmovdqa64 16(%rsi), %xmm16 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm17 +; AVX512BW-NEXT: vmovdqa64 16(%rdi), %xmm18 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm20, %zmm19 ; AVX512BW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm16[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm5 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm20[0,0,2,1,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm21 = xmm21[0],zero,xmm21[1],zero -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm23 = 
xmm20[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm23 = xmm23[0],zero,xmm23[1],zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm21, %ymm10 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7,8],ymm10[9],ymm5[10,11,12],ymm10[13],ymm5[14,15] -; AVX512BW-NEXT: vpermt2w %ymm20, %ymm22, %ymm16 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm16, %zmm5, %zmm16 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3],xmm19[4],xmm18[4],xmm19[5],xmm18[5],xmm19[6],xmm18[6],xmm19[7],xmm18[7] -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm24, %zmm10 -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm16 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,0,2,1,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero -; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15] -; AVX512BW-NEXT: vpermt2w %ymm11, %ymm22, %ymm5 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] -; AVX512BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm11 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm10 = 
xmm10[0],zero,xmm10[1],zero -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] -; AVX512BW-NEXT: vpermt2w %ymm3, %ymm22, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512BW-NEXT: vpermt2w %zmm2, %zmm24, %zmm3 -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm19 {%k1} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm14 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7] +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm21 {%k1} +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm10, %zmm3 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm20, %zmm6 +; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm6 {%k1} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm1 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] +; 
AVX512BW-NEXT: vpermt2w %zmm0, %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -4430,102 +4269,58 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa 16(%rsi), %xmm11 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-BW-NEXT: vmovdqa 16(%rcx), %xmm13 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-BW-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,2,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7,8],ymm7[9],ymm5[10,11,12],ymm7[13],ymm5[14,15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm22 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] -; AVX512DQ-BW-NEXT: vpermt2w %ymm6, %ymm22, %ymm0 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm6 -; AVX512DQ-BW-NEXT: vmovdqa 16(%r11), %xmm15 -; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%r10), %xmm17 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%r9), %xmm18 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%r8), %xmm19 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] -; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm24, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm0 +; AVX512DQ-BW-NEXT: vmovdqa 16(%r11), %xmm1 +; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm2 +; AVX512DQ-BW-NEXT: vmovdqa 16(%r10), %xmm3 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 
+; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm5 +; AVX512DQ-BW-NEXT: vmovdqa 16(%r9), %xmm6 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm7 +; AVX512DQ-BW-NEXT: vmovdqa 16(%r8), %xmm8 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] +; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512DQ-BW-NEXT: vmovdqa 16(%rcx), %xmm11 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512DQ-BW-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15] +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rsi), %xmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %xmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdi), %xmm18 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm20, %zmm19 ; AVX512DQ-BW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm16[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm5 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm20[0,0,2,1,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm21 = xmm21[0],zero,xmm21[1],zero -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm23 = xmm20[0,2,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm23 = xmm23[0],zero,xmm23[1],zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm21, %ymm10 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7,8],ymm10[9],ymm5[10,11,12],ymm10[13],ymm5[14,15] -; AVX512DQ-BW-NEXT: vpermt2w %ymm20, %ymm22, %ymm16 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm16, %zmm5, %zmm16 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3],xmm19[4],xmm18[4],xmm19[5],xmm18[5],xmm19[6],xmm18[6],xmm19[7],xmm18[7] -; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm24, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = 
xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,0,2,1,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,2,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15] -; AVX512DQ-BW-NEXT: vpermt2w %ymm11, %ymm22, %ymm5 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] -; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,2,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] -; AVX512DQ-BW-NEXT: vpermt2w %ymm3, %ymm22, %ymm1 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm24, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) -; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm14 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm20, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm10, %zmm3 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm20, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm3, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm20, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -5923,407 +5718,336 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: subq $328, %rsp # imm = 0x148 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-NEXT: vmovdqa (%r10), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rdi), 
%xmm1 +; AVX2-NEXT: vmovdqa 48(%r10), %xmm11 +; AVX2-NEXT: vmovdqa (%rax), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa (%r9), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa (%r8), %xmm3 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] -; AVX2-NEXT: vmovdqa (%r10), %xmm0 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7] +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6],ymm1[7],ymm4[8,9,10,11,12],ymm1[13],ymm4[14],ymm1[15] +; AVX2-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rax), %xmm1 +; AVX2-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa (%r9), %xmm0 +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%r8), %xmm6 +; AVX2-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw 
{{.*#+}} xmm7 = xmm6[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-NEXT: vmovdqa 48(%rsi), %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm1[5],ymm7[6],ymm1[7],ymm7[8,9,10,11,12],ymm1[13],ymm7[14],ymm1[15] -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] -; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm5 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7,8],ymm1[9],ymm7[10,11,12],ymm1[13],ymm7[14,15] +; AVX2-NEXT: vmovdqa 48(%rax), %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5,6,7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13,14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7] +; AVX2-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa 48(%r9), %xmm3 +; AVX2-NEXT: vmovdqa 48(%r8), %xmm5 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,1,3,4,5,6,7] +; 
AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 +; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5,6,7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13,14,15] ; AVX2-NEXT: vmovdqa 48(%rcx), %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5,6,7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm4 -; AVX2-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] -; AVX2-NEXT: vmovdqa 48(%r10), %xmm4 -; AVX2-NEXT: vmovdqa 48(%rax), %xmm5 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm12 -; AVX2-NEXT: vmovdqa 48(%r9), %xmm6 -; AVX2-NEXT: vmovdqa 48(%r8), %xmm7 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm14[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm15, %ymm13 -; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5,6,7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] +; AVX2-NEXT: vmovdqa 48(%rdx), %xmm4 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm12 = 
xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX2-NEXT: vpshufb %xmm14, %xmm12, %xmm6 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm13 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero +; AVX2-NEXT: vmovdqa 48(%rsi), %xmm6 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7,8],ymm13[9],ymm15[10,11,12],ymm13[13],ymm15[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2],ymm10[3],ymm13[4],ymm10[5],ymm13[6],ymm10[7] ; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7,8],ymm9[9],ymm8[10,11,12],ymm9[13],ymm8[14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6],ymm8[7],ymm9[8,9,10,11,12],ymm8[13],ymm9[14],ymm8[15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm12[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm14[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm14[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6],ymm9[7],ymm10[8,9,10,11,12],ymm9[13],ymm10[14],ymm9[15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15] -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7,8],ymm9[9],ymm0[10,11,12],ymm9[13],ymm0[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5,6,7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13,14,15] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5,6,7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX2-NEXT: vpshufb %xmm14, %xmm2, %xmm4 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15] +; AVX2-NEXT: 
vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6],ymm0[7],ymm1[8,9,10,11,12],ymm0[13],ymm1[14],ymm0[15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6],ymm0[7],ymm2[8,9,10,11,12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 32(%r10), %xmm0 +; AVX2-NEXT: vmovdqa 32(%rax), %xmm1 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm4 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX2-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = 
xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 +; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5,6,7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13,14,15] ; AVX2-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX2-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] -; AVX2-NEXT: vmovdqa 32(%r10), %xmm4 -; AVX2-NEXT: vmovdqa 32(%rax), %xmm5 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm12 -; AVX2-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX2-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX2-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX2-NEXT: vpshufb %xmm14, %xmm11, %xmm6 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm12 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero +; AVX2-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm7 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 -; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5,6,7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm15 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero,xmm13[2],zero,zero,zero,xmm13[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7,8],ymm12[9],ymm15[10,11,12],ymm12[13],ymm15[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] ; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero 
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7,8],ymm9[9],ymm8[10,11,12],ymm9[13],ymm8[14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6],ymm8[7],ymm9[8,9,10,11,12],ymm8[13],ymm9[14],ymm8[15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,4,6,5] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6],ymm9[7],ymm10[8,9,10,11,12],ymm9[13],ymm10[14],ymm9[15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7,8],ymm9[9],ymm10[10,11,12],ymm9[13],ymm10[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15] -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = 
xmm0[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5,6,7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13,14,15] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5,6,7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX2-NEXT: vpshufb %xmm14, %xmm2, %xmm4 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6],ymm0[7],ymm1[8,9,10,11,12],ymm0[13],ymm1[14],ymm0[15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6],ymm1[7],ymm2[8,9,10,11,12],ymm1[13],ymm2[14],ymm1[15] -; 
AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vmovdqa 16(%rsi), %xmm14 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqa 16(%rcx), %xmm11 -; AVX2-NEXT: vmovdqa 16(%rdx), %xmm9 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX2-NEXT: vmovdqa 16(%r10), %xmm7 -; AVX2-NEXT: vmovdqa 16(%rax), %xmm6 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3 +; AVX2-NEXT: vmovdqa 16(%r10), %xmm12 +; AVX2-NEXT: vmovdqa 16(%rax), %xmm10 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm10[8],xmm12[8],xmm10[9],xmm12[9],xmm10[10],xmm12[10],xmm10[11],xmm12[11],xmm10[12],xmm12[12],xmm10[13],xmm12[13],xmm10[14],xmm12[14],xmm10[15],xmm12[15] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 16(%r9), %xmm5 ; AVX2-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm2, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5,6,7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm2[1],ymm10[2],ymm2[3],ymm10[4],ymm2[5],ymm10[6],ymm2[7] -; 
AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7],ymm0[8,9,10,11,12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5,6,7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13,14,15] +; AVX2-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX2-NEXT: vmovdqa 16(%rdx), %xmm6 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX2-NEXT: vpshufb %xmm14, %xmm15, %xmm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm13 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[2,3,4],ymm0[5],ymm13[6,7,8],ymm0[9],ymm13[10,11,12],ymm0[13],ymm13[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm9[1],ymm0[2],ymm9[3],ymm0[4],ymm9[5],ymm0[6],ymm9[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm11[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,7,7] +; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm8, %ymm8 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5],ymm8[6],ymm0[7],ymm8[8,9,10,11,12],ymm0[13],ymm8[14],ymm0[15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm8, %ymm8 +; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7,8],ymm8[9],ymm1[10,11,12],ymm8[13],ymm1[14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15] -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4,5,6,7,8],ymm6[9],ymm5[10],ymm6[11],ymm5[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4],ymm5[5],ymm0[6],ymm5[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm5 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4,5,6,7,8],ymm0[9],ymm5[10],ymm0[11],ymm5[12,13,14,15] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX2-NEXT: vpshufb %xmm14, %xmm3, %xmm5 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,5,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6],ymm2[7],ymm3[8,9,10,11,12],ymm2[13],ymm3[14],ymm2[15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6],ymm1[7],ymm4[8,9,10,11,12],ymm1[13],ymm4[14],ymm1[15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, 
%ymm3 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,3,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4,5,6,7,8],ymm3[9],ymm5[10],ymm3[11],ymm5[12,13,14,15] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm9, %ymm9 -; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4,5,6,7,8],ymm6[9],ymm9[10],ymm6[11],ymm9[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6],ymm4[7],ymm5[8,9,10,11,12],ymm4[13],ymm5[14],ymm4[15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-NEXT: # xmm6 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] +; AVX2-NEXT: vpshufb %xmm14, %xmm6, %xmm7 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7,8],ymm7[9],ymm10[10,11,12],ymm7[13],ymm10[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm2 +; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6],ymm2[7],ymm4[8,9,10,11,12],ymm2[13],ymm4[14],ymm2[15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa %ymm2, 96(%rax) ; AVX2-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX2-NEXT: vmovdqa %ymm1, 160(%rax) ; AVX2-NEXT: vmovdqa %ymm0, 128(%rax) ; AVX2-NEXT: vmovdqa %ymm8, 224(%rax) -; AVX2-NEXT: vmovdqa %ymm10, 192(%rax) +; AVX2-NEXT: vmovdqa %ymm9, 192(%rax) ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 288(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6992,147 +6716,145 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512-LABEL: store_i8_stride8_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $680, %rsp # imm = 0x2A8 +; AVX512-NEXT: subq $552, %rsp # imm = 0x228 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512-NEXT: vmovdqa 48(%rcx), %xmm2 -; AVX512-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw 
{{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%r10), %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 48(%r10), %xmm4 -; AVX512-NEXT: vmovdqa (%rax), %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 48(%rax), %xmm5 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%r9), %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX512-NEXT: vmovdqa 48(%rcx), %xmm0 +; AVX512-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm6 +; AVX512-NEXT: vmovdqa (%r10), %xmm5 +; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa 48(%r10), %xmm3 +; AVX512-NEXT: vmovdqa (%rax), %xmm2 +; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa 48(%rax), %xmm4 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa (%r9), %xmm5 +; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX512-NEXT: vmovdqa 48(%r9), %xmm7 -; AVX512-NEXT: vmovdqa (%r8), %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa (%r8), %xmm2 +; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa 48(%r8), %xmm12 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm9 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm13, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm11 -; AVX512-NEXT: vmovdqa 32(%rax), %xmm2 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 
$1, %xmm3, %ymm13, %ymm31 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm28 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm23 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm21 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm20 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm30 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm29 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm24 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm22 -; AVX512-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm8 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm5, %ymm5 +; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] +; AVX512-NEXT: 
vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm9 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm5 +; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm5, %ymm5 +; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm13, %ymm2 +; AVX512-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovdqa 32(%r10), %xmm2 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm10 +; AVX512-NEXT: vmovdqa 32(%rax), %xmm0 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm13, %ymm28 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm30 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm27 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm26 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm22 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm4, %ymm3, 
%ymm20 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm18 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm3, %ymm25 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm21 +; AVX512-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,1,1,3,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[2,1,3,3,4,5,6,7] ; AVX512-NEXT: vinserti32x4 $1, %xmm13, %ymm12, %ymm19 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,4,5,5,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm12, %ymm18 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm4 +; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm12, %ymm17 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm7, %ymm4 ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm4 +; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 
+; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 @@ -7141,9 +6863,9 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 16(%rcx), %xmm14 -; AVX512-NEXT: vmovdqa 16(%rdx), %xmm12 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX512-NEXT: vmovdqa 16(%rcx), %xmm11 +; AVX512-NEXT: vmovdqa 16(%rdx), %xmm7 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 @@ -7153,83 +6875,65 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 16(%r10), %xmm1 -; AVX512-NEXT: vmovdqa 16(%rax), %xmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm17 +; AVX512-NEXT: vmovdqa 16(%rax), %xmm15 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm16 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 16(%r9), %xmm0 -; AVX512-NEXT: vmovdqa 16(%r8), %xmm15 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm6[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm29 +; AVX512-NEXT: vmovdqa 16(%r9), %xmm14 +; AVX512-NEXT: vmovdqa 16(%r8), %xmm12 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm13, %ymm1, %ymm31 +; AVX512-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512-NEXT: vmovdqa64 %xmm2, %xmm25 -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm27 -; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512-NEXT: vmovdqa 48(%rsi), %xmm2 -; AVX512-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512-NEXT: # ymm5 = mem[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm10 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm24 +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm13 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX512-NEXT: vmovdqa 48(%rsi), %xmm1 +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm13, %zmm6 ; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm13 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm13 & (zmm10 ^ zmm3)) -; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm13 & (zmm6 ^ zmm0)) +; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512-NEXT: # ymm3 = mem[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512-NEXT: # ymm5 = mem[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX512-NEXT: vpandnq %zmm4, %zmm3, %zmm4 -; AVX512-NEXT: vpandq %zmm3, %zmm5, %zmm5 +; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX512-NEXT: vpandnq %zmm0, %zmm8, %zmm0 +; AVX512-NEXT: vpandq %zmm8, %zmm3, %zmm3 ; AVX512-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpord %zmm4, %zmm5, %zmm10 {%k1} -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm4, 
%zmm2, %zmm9 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm13 & (zmm9 ^ zmm0)) +; AVX512-NEXT: vpord %zmm0, %zmm3, %zmm6 {%k1} +; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm9 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm0 ^ (zmm13 & (zmm9 ^ zmm0)) ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload @@ -7237,238 +6941,196 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX512-NEXT: # ymm2 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd $232, (%rsp), %ymm4 # 32-byte Folded Reload -; AVX512-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512-NEXT: vpandnq %zmm0, %zmm3, %zmm0 -; AVX512-NEXT: vpandq %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpshufd $232, (%rsp), %ymm3 # 32-byte Folded Reload +; AVX512-NEXT: # ymm3 = mem[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512-NEXT: vpandnq %zmm0, %zmm8, %zmm0 +; AVX512-NEXT: vpandq %zmm8, %zmm2, %zmm2 ; AVX512-NEXT: vpord %zmm0, %zmm2, %zmm9 {%k1} -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm11 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm13 & (zmm11 ^ zmm0)) -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm13 & (zmm10 ^ zmm0)) +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm30[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, 
%zmm0, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512-NEXT: vpandnq %zmm0, %zmm3, %zmm0 -; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1 -; AVX512-NEXT: vpord %zmm0, %zmm1, %zmm11 {%k1} -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm28 -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm13 & (zmm4 ^ zmm0)) -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm24[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm22[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm18[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512-NEXT: vpandnq %zmm0, %zmm3, %zmm0 -; AVX512-NEXT: vpandq %zmm3, %zmm6, %zmm6 -; AVX512-NEXT: vpord %zmm0, %zmm6, %zmm4 {%k1} -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpandnq %zmm0, %zmm8, %zmm0 +; AVX512-NEXT: vpandq %zmm8, %zmm1, %zmm1 +; AVX512-NEXT: vpord %zmm0, %zmm1, %zmm10 {%k1} +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm20[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm27 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm3[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm18 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm13 & (zmm4 ^ zmm2)) +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm19[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm17[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512-NEXT: vpandnq %zmm2, %zmm8, %zmm2 +; AVX512-NEXT: vpandq %zmm8, %zmm5, %zmm5 +; AVX512-NEXT: vpord %zmm2, %zmm5, %zmm4 {%k1} +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm2, %ymm17 +; AVX512-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm3, %ymm21 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm18 -; AVX512-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm17, %xmm2 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm6, %ymm2, %ymm21 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm2, %ymm22 -; AVX512-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm2, %ymm26 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm6, %ymm7, %ymm19 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX512-NEXT: # xmm7 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm14 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm15 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX512-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm12, %ymm8, %ymm17 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm8, %ymm20 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm3, %ymm22 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm2, %ymm30 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX512-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm5[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm12, %ymm14 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm12, %ymm15 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX512-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm12, %ymm19 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm7, %ymm20 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; 
AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX512-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm8 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,5,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 ; AVX512-NEXT: vmovdqa 16(%rsi), %xmm7 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] -; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[2,3,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm23 = xmm5[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512-NEXT: vinserti32x4 $1, %xmm23, %ymm12, %ymm12 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm23, %ymm5 -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm12 -; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512-NEXT: # ymm23 = mem[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm5 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm13 & (zmm5 ^ zmm12)) -; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512-NEXT: # ymm12 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512-NEXT: # ymm23 = mem[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm12 -; AVX512-NEXT: vpandnq %zmm12, %zmm3, %zmm12 +; AVX512-NEXT: vmovdqa64 %xmm23, %xmm12 +; AVX512-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] +; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512-NEXT: # ymm23 = mem[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload +; AVX512-NEXT: # ymm24 = mem[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm23 +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm24 = 
xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm24, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm23 ^ (zmm13 & (zmm5 ^ zmm23)) ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload ; AVX512-NEXT: # ymm23 = mem[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload ; AVX512-NEXT: # ymm24 = mem[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm23 -; AVX512-NEXT: vpandq %zmm3, %zmm23, %zmm23 -; AVX512-NEXT: vpord %zmm12, %zmm23, %zmm5 {%k1} -; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm23 = xmm0[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512-NEXT: vmovdqa64 %xmm25, %xmm2 -; AVX512-NEXT: vmovdqa64 %xmm27, %xmm6 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512-NEXT: vinserti32x4 $1, %xmm23, %ymm12, %ymm12 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm24, %ymm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm23 = xmm7[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm24 = xmm7[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload -; AVX512-NEXT: # ymm30 = mem[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm30, %zmm12, %zmm12 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm13 & (zmm12 ^ zmm0)) -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm30 = xmm2[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm24 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero,xmm11[2],zero,zero,zero,xmm11[3],zero,zero,zero +; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload ; AVX512-NEXT: # ymm25 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512-NEXT: # ymm27 = mem[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 -; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512-NEXT: # ymm27 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512-NEXT: # ymm16 = mem[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 -; AVX512-NEXT: vpandnq %zmm25, %zmm3, %zmm25 -; 
AVX512-NEXT: vpandq %zmm3, %zmm16, %zmm16 -; AVX512-NEXT: vpord %zmm25, %zmm16, %zmm12 {%k1} -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero -; AVX512-NEXT: vinserti32x4 $1, %xmm23, %ymm16, %ymm16 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm29, %ymm7 -; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 -; AVX512-NEXT: vpshufd {{.*#+}} ymm16 = ymm28[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm13 & (zmm16 ^ zmm7)) -; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm18 = ymm22[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm7, %zmm7 -; AVX512-NEXT: vpshufd {{.*#+}} ymm18 = ymm26[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6 -; AVX512-NEXT: vpandnq %zmm7, %zmm3, %zmm7 -; AVX512-NEXT: vpandq %zmm3, %zmm6, %zmm6 -; AVX512-NEXT: vpord %zmm7, %zmm6, %zmm16 {%k1} -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512-NEXT: # ymm26 = mem[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm25 +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm26 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] +; AVX512-NEXT: vpandnq %zmm23, %zmm8, %zmm23 +; AVX512-NEXT: vpandq %zmm8, %zmm25, %zmm25 +; AVX512-NEXT: vpord %zmm23, %zmm25, %zmm5 {%k1} +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm23 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero +; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] +; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512-NEXT: # ymm25 = mem[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload +; AVX512-NEXT: # ymm28 = mem[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm25, %zmm25 +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero,xmm11[2],zero,zero,zero,xmm11[3],zero,zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm24, %zmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm25 ^ (zmm13 & (zmm11 ^ zmm25)) +; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload +; AVX512-NEXT: # ymm24 = mem[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm25 = ymm29[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm24 +; AVX512-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm16 = ymm27[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm25, %zmm16 +; AVX512-NEXT: vpandnq %zmm24, %zmm8, %zmm24 +; AVX512-NEXT: vpandq %zmm8, %zmm16, %zmm16 +; 
AVX512-NEXT: vpord %zmm24, %zmm16, %zmm11 {%k1} +; AVX512-NEXT: vpshufd {{.*#+}} ymm16 = ymm18[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm26, %zmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm16 ^ (zmm13 & (zmm7 ^ zmm16)) +; AVX512-NEXT: vpshufd {{.*#+}} ymm16 = ymm21[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm17 = ymm22[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512-NEXT: vpandnq %zmm16, %zmm8, %zmm3 +; AVX512-NEXT: vpandq %zmm8, %zmm2, %zmm2 +; AVX512-NEXT: vpord %zmm3, %zmm2, %zmm7 {%k1} ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm13 & (zmm2 ^ zmm0)) -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm23, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm2 ^ (zmm13 & (zmm3 ^ zmm2)) +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm20[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 -; AVX512-NEXT: vpandnq %zmm0, %zmm3, %zmm0 -; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1 -; AVX512-NEXT: vpord %zmm0, %zmm1, %zmm2 {%k1} +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandnq %zmm2, %zmm8, %zmm1 +; AVX512-NEXT: vpandq %zmm8, %zmm0, %zmm0 +; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm3 {%k1} ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm11, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm11, 448(%rax) +; AVX512-NEXT: vmovdqa64 %zmm10, 448(%rax) ; AVX512-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm10, 64(%rax) -; AVX512-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX512-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512-NEXT: addq $552, %rsp # imm = 0x228 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -7766,147 +7428,145 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512DQ-LABEL: store_i8_stride8_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $680, %rsp # imm = 0x2A8 +; AVX512DQ-NEXT: subq $552, %rsp # imm = 0x228 ; 
AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512DQ-NEXT: vmovdqa 48(%rcx), %xmm2 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r10), %xmm1 -; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 48(%r10), %xmm4 -; AVX512DQ-NEXT: vmovdqa (%rax), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 48(%rax), %xmm5 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm1 -; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX512DQ-NEXT: vmovdqa 48(%rcx), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm6 +; AVX512DQ-NEXT: vmovdqa (%r10), %xmm5 +; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa 48(%r10), %xmm3 +; AVX512DQ-NEXT: vmovdqa (%rax), %xmm2 +; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa 
48(%rax), %xmm4 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm5 +; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa 48(%r9), %xmm7 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa 48(%r8), %xmm12 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm9 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm13, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm11 -; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm2 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm13, %ymm31 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm28 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm23 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm21 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm20 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm30 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm29 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm24 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm22 -; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm8 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm5, %ymm5 +; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm9 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm5 +; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm5, %ymm5 +; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm13, %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%r10), %xmm2 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm10 +; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm0 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm13, %ymm28 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = 
xmm1[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm30 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm27 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm26 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm22 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm20 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm18 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm3, %ymm25 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm21 +; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,1,1,3,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[2,1,3,3,4,5,6,7] ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm13, %ymm12, %ymm19 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm12, %ymm18 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm4 +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm12, %ymm17 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm7, %ymm4 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = 
xmm1[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm4 +; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 @@ -7915,9 +7575,9 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 16(%rcx), %xmm14 -; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm12 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX512DQ-NEXT: vmovdqa 16(%rcx), %xmm11 +; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm7 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 @@ -7927,83 +7587,65 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 16(%r10), %xmm1 -; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm2 -; 
AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm17 +; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm15 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm16 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 16(%r9), %xmm0 -; AVX512DQ-NEXT: vmovdqa 16(%r8), %xmm15 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm29 +; AVX512DQ-NEXT: vmovdqa 16(%r9), %xmm14 +; AVX512DQ-NEXT: vmovdqa 16(%r8), %xmm12 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm13, %ymm1, %ymm31 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm25 -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm27 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm2 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero 
-; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm5 = mem[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm10 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm24 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm13 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm1 +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm13, %zmm6 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm13 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm13 & (zmm10 ^ zmm3)) -; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm13 & (zmm6 ^ zmm0)) +; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm3 = mem[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm5 = mem[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 
-; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX512DQ-NEXT: vpandnq %zmm4, %zmm3, %zmm4 -; AVX512DQ-NEXT: vpandq %zmm3, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX512DQ-NEXT: vpandnq %zmm0, %zmm8, %zmm0 +; AVX512DQ-NEXT: vpandq %zmm8, %zmm3, %zmm3 ; AVX512DQ-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vpord %zmm4, %zmm5, %zmm10 {%k1} -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm13 & (zmm9 ^ zmm0)) +; AVX512DQ-NEXT: vpord %zmm0, %zmm3, %zmm6 {%k1} +; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm0 ^ (zmm13 & (zmm9 ^ zmm0)) ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload @@ -8011,238 +7653,196 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm2 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd $232, (%rsp), %ymm4 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpandnq %zmm0, %zmm3, %zmm0 -; AVX512DQ-NEXT: vpandq %zmm3, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpshufd $232, (%rsp), %ymm3 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm3 = mem[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpandnq %zmm0, %zmm8, %zmm0 +; AVX512DQ-NEXT: vpandq %zmm8, %zmm2, %zmm2 ; AVX512DQ-NEXT: vpord %zmm0, %zmm2, %zmm9 {%k1} -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm11 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm13 & (zmm11 ^ zmm0)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm13 & (zmm10 ^ zmm0)) +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm30[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpandnq %zmm0, %zmm3, %zmm0 -; AVX512DQ-NEXT: vpandq %zmm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpord %zmm0, %zmm1, %zmm11 {%k1} -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm28 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpmovzxwq 
{{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm13 & (zmm4 ^ zmm0)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm24[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm22[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm18[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-NEXT: vpandnq %zmm0, %zmm3, %zmm0 -; AVX512DQ-NEXT: vpandq %zmm3, %zmm6, %zmm6 -; AVX512DQ-NEXT: vpord %zmm0, %zmm6, %zmm4 {%k1} -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpandnq %zmm0, %zmm8, %zmm0 +; AVX512DQ-NEXT: vpandq %zmm8, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpord %zmm0, %zmm1, %zmm10 {%k1} +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm20[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm27 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm18 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm13 & (zmm4 ^ zmm2)) +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm19[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm17[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpandnq %zmm2, %zmm8, %zmm2 +; AVX512DQ-NEXT: vpandq %zmm8, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpord %zmm2, %zmm5, %zmm4 {%k1} +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm2, %ymm17 +; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] +; 
AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm3, %ymm21 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm18 -; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm2 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm6, %ymm2, %ymm21 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm2, %ymm22 -; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm2, %ymm26 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm6, %ymm7, %ymm19 -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm14 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm15 -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm12, %ymm8, %ymm17 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm8, %ymm20 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm3, %ymm22 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = 
xmm3[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm2, %ymm30 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm5[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm12, %ymm14 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm12, %ymm15 +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm12, %ymm19 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm7, %ymm20 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX512DQ-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm8 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 ; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm7 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[2,3,2,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm23 = xmm5[3,3,3,3] -; AVX512DQ-NEXT: 
vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm23, %ymm12, %ymm12 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm23, %ymm5 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm12 -; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm23 = mem[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm13 & (zmm5 ^ zmm12)) -; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm12 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm23 = mem[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm12 -; AVX512DQ-NEXT: vpandnq %zmm12, %zmm3, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm12 +; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] +; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm23 = mem[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm24 = mem[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm23 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm24 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm24, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm23 ^ (zmm13 & (zmm5 ^ zmm23)) ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm23 = mem[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm24 = mem[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm23 -; AVX512DQ-NEXT: vpandq %zmm3, %zmm23, %zmm23 -; AVX512DQ-NEXT: vpord %zmm12, %zmm23, %zmm5 {%k1} -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm23 = xmm0[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm2 -; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm6 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm23, %ymm12, %ymm12 -; AVX512DQ-NEXT: vpmovzxwq 
{{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm24, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm23 = xmm7[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm24 = xmm7[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm30 = mem[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm30, %zmm12, %zmm12 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm13 & (zmm12 ^ zmm0)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm30 = xmm2[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm24 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero,xmm11[2],zero,zero,zero,xmm11[3],zero,zero,zero +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm25 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm27 = mem[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 -; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm27 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm16 = mem[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 -; AVX512DQ-NEXT: vpandnq %zmm25, %zmm3, %zmm25 -; AVX512DQ-NEXT: vpandq %zmm3, %zmm16, %zmm16 -; AVX512DQ-NEXT: vpord %zmm25, %zmm16, %zmm12 {%k1} -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm23, %ymm16, %ymm16 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm29, %ymm7 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm16 = ymm28[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm13 & (zmm16 ^ zmm7)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm18 = ymm22[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm18 = ymm26[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6 -; AVX512DQ-NEXT: vpandnq %zmm7, %zmm3, %zmm7 -; AVX512DQ-NEXT: vpandq %zmm3, %zmm6, %zmm6 -; AVX512DQ-NEXT: vpord %zmm7, %zmm6, %zmm16 {%k1} -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero -; AVX512DQ-NEXT: 
vinserti128 $1, %xmm6, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm26 = mem[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm25 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm26 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] +; AVX512DQ-NEXT: vpandnq %zmm23, %zmm8, %zmm23 +; AVX512DQ-NEXT: vpandq %zmm8, %zmm25, %zmm25 +; AVX512DQ-NEXT: vpord %zmm23, %zmm25, %zmm5 {%k1} +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm23 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] +; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm25 = mem[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm28 = mem[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm28, %zmm25, %zmm25 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero,xmm11[2],zero,zero,zero,xmm11[3],zero,zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm24, %zmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm25 ^ (zmm13 & (zmm11 ^ zmm25)) +; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm24 = mem[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm25 = ymm29[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm24 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm16 = ymm27[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm25, %zmm16 +; AVX512DQ-NEXT: vpandnq %zmm24, %zmm8, %zmm24 +; AVX512DQ-NEXT: vpandq %zmm8, %zmm16, %zmm16 +; AVX512DQ-NEXT: vpord %zmm24, %zmm16, %zmm11 {%k1} +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm16 = ymm18[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm26, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm16 ^ (zmm13 & (zmm7 ^ zmm16)) +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm16 = ymm21[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm17 = ymm22[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpandnq %zmm16, %zmm8, %zmm3 +; AVX512DQ-NEXT: vpandq %zmm8, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpord %zmm3, %zmm2, %zmm7 {%k1} ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm13 & (zmm2 ^ zmm0)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,2,2,3,4,6,6,7] -; 
AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm23, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm2 ^ (zmm13 & (zmm3 ^ zmm2)) +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm20[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 -; AVX512DQ-NEXT: vpandnq %zmm0, %zmm3, %zmm0 -; AVX512DQ-NEXT: vpandq %zmm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpord %zmm0, %zmm1, %zmm2 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpandnq %zmm2, %zmm8, %zmm1 +; AVX512DQ-NEXT: vpandq %zmm8, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm3 {%k1} ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 448(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 448(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 64(%rax) -; AVX512DQ-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQ-NEXT: addq $552, %rsp # imm = 0x228 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -8576,22 +8176,16 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rsi), %xmm7 ; AVX512BW-NEXT: vmovdqa64 48(%rsi), %xmm24 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512BW-NEXT: vmovdqa64 48(%rdi), %xmm27 +; AVX512BW-NEXT: vmovdqa64 48(%rdi), %xmm26 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[3,3,3,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512BW-NEXT: vinserti64x4 $1, 
%ymm1, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqa (%rcx), %xmm9 ; AVX512BW-NEXT: vmovdqa64 48(%rcx), %xmm28 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512BW-NEXT: vmovdqa64 48(%rdx), %xmm29 +; AVX512BW-NEXT: vmovdqa64 48(%rdx), %xmm30 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm20[0,1,2,3,4,4,6,5] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,6,6,7] @@ -8621,18 +8215,12 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpermw %ymm20, %ymm6, %ymm20 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm23 ; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm27[0],xmm24[0],xmm27[1],xmm24[1],xmm27[2],xmm24[2],xmm27[3],xmm24[3],xmm27[4],xmm24[4],xmm27[5],xmm24[5],xmm27[6],xmm24[6],xmm27[7],xmm24[7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm14[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm22 = xmm14[3,3,3,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm14, %ymm22, %ymm14 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm14, %zmm14 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm20 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero +; AVX512BW-NEXT: vinserti64x4 $1, %ymm14, %zmm20, %zmm14 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,4,6,5] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,6,6,7] ; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm22, %ymm25 @@ -8655,144 +8243,120 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,5,5,7] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm21 -; AVX512BW-NEXT: vmovdqa64 32(%rcx), %xmm19 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm19 +; 
AVX512BW-NEXT: vmovdqa64 32(%rcx), %xmm27 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[0,2,2,3,4,6,6,7] ; AVX512BW-NEXT: vpermw %ymm18, %ymm6, %ymm18 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 -; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm26 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm18 +; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm29 ; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm18 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm27[8],xmm24[8],xmm27[9],xmm24[9],xmm27[10],xmm24[10],xmm27[11],xmm24[11],xmm27[12],xmm24[12],xmm27[13],xmm24[13],xmm27[14],xmm24[14],xmm27[15],xmm24[15] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm15[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm24 = xmm15[3,3,3,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm19 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero,xmm15[2],zero,zero,zero,xmm15[3],zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,3,2,3] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero,xmm15[2],zero,zero,zero,xmm15[3],zero,zero,zero +; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm19, %zmm15 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm19[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm19[0,1,2,3,4,6,6,7] ; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm15, %ymm24, %ymm15 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm15, %zmm15 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm24, %ymm24 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vpermw %ymm21, %ymm11, %ymm21 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm24, %zmm21, %zmm21 -; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm15 {%k2} +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[2,1,3,3,6,5,7,7] +; AVX512BW-NEXT: vpermw %ymm19, %ymm11, %ymm19 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm19, %zmm19 +; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm15 {%k2} ; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm15 {%k3} ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm18[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,4,6,5] +; 
AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm19 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpermw %ymm18, %ymm3, %ymm18 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm18 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm19[0,1,2,3,4,5,5,7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm19[0,1,2,3,6,5,7,7] ; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm18, %ymm3, %ymm18 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,5,5,7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm24, %ymm24 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm21, %ymm6, %ymm21 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm24, %zmm21, %zmm27 -; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm27 {%k1} +; AVX512BW-NEXT: vpermw %ymm19, %ymm6, %ymm19 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm19, %zmm24 +; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm24 {%k1} ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm23[0],xmm20[0],xmm23[1],xmm20[1],xmm23[2],xmm20[2],xmm23[3],xmm20[3],xmm23[4],xmm20[4],xmm23[5],xmm20[5],xmm23[6],xmm20[6],xmm23[7],xmm20[7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm18[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm24 = xmm18[3,3,3,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm18 = xmm18[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm18, %ymm24, %ymm18 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm26[0],xmm19[0],xmm26[1],xmm19[1],xmm26[2],xmm19[2],xmm26[3],xmm19[3],xmm26[4],xmm19[4],xmm26[5],xmm19[5],xmm26[6],xmm19[6],xmm26[7],xmm19[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm28 = xmm21[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm28, %ymm24, %ymm28 -; AVX512BW-NEXT: vmovdqa64 16(%r9), %xmm24 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm28 = ymm28[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vpermw %ymm21, %ymm11, %ymm21 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm28, %zmm21, %zmm21 -; AVX512BW-NEXT: vmovdqa64 16(%r8), %xmm28 -; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 16(%rsi), %xmm21 -; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm18 {%k3} +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm19 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero,xmm18[2],zero,zero,zero,xmm18[3],zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm18 = xmm18[2,3,2,3] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm18 = 
xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero,xmm18[2],zero,zero,zero,xmm18[3],zero,zero,zero +; AVX512BW-NEXT: vinserti64x4 $1, %ymm18, %zmm19, %zmm18 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm29[0],xmm27[0],xmm29[1],xmm27[1],xmm29[2],xmm27[2],xmm29[3],xmm27[3],xmm29[4],xmm27[4],xmm29[5],xmm27[5],xmm29[6],xmm27[6],xmm29[7],xmm27[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm19[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm19[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm26, %ymm21, %ymm26 +; AVX512BW-NEXT: vmovdqa64 16(%r9), %xmm21 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm26 = ymm26[2,1,3,3,6,5,7,7] +; AVX512BW-NEXT: vpermw %ymm19, %ymm11, %ymm19 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm26, %zmm19, %zmm19 +; AVX512BW-NEXT: vmovdqa64 16(%r8), %xmm26 +; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa64 16(%rsi), %xmm19 +; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm18 {%k3} ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm16[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm16[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm17, %ymm17 -; AVX512BW-NEXT: vmovdqa64 16(%rdi), %xmm27 +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm16[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm17, %ymm17 +; AVX512BW-NEXT: vmovdqa64 16(%rdi), %xmm24 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7] ; AVX512BW-NEXT: vpermw %ymm16, %ymm3, %ymm16 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm22[0,1,2,3,4,5,5,7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm22[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm17, %ymm25 -; AVX512BW-NEXT: vmovdqa64 16(%rcx), %xmm17 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm17[0,1,2,3,4,5,5,7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm17[0,1,2,3,6,5,7,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm22, %ymm25 +; AVX512BW-NEXT: vmovdqa64 16(%rcx), %xmm22 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm22, %ymm6, %ymm22 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm22, %zmm25 -; AVX512BW-NEXT: vmovdqa64 16(%rdx), %xmm22 -; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm25 {%k1} +; AVX512BW-NEXT: vpermw %ymm17, %ymm6, %ymm17 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm17, %zmm17 +; AVX512BW-NEXT: vmovdqa64 16(%rdx), %xmm25 +; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm17 {%k1} ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm23[8],xmm20[8],xmm23[9],xmm20[9],xmm23[10],xmm20[10],xmm23[11],xmm20[11],xmm23[12],xmm20[12],xmm23[13],xmm20[13],xmm23[14],xmm20[14],xmm23[15],xmm20[15] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm16[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm23 = xmm16[3,3,3,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = 
xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm16 = xmm16[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm23, %ymm16 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm16, %zmm16 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm19[8],xmm26[9],xmm19[9],xmm26[10],xmm19[10],xmm26[11],xmm19[11],xmm26[12],xmm19[12],xmm26[13],xmm19[13],xmm26[14],xmm19[14],xmm26[15],xmm19[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vpermw %ymm19, %ymm11, %ymm19 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19 -; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k3} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm20 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm16 = xmm16[2,3,2,3] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero +; AVX512BW-NEXT: vinserti64x4 $1, %ymm16, %zmm20, %zmm16 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm27[8],xmm29[9],xmm27[9],xmm29[10],xmm27[10],xmm29[11],xmm27[11],xmm29[12],xmm27[12],xmm29[13],xmm27[13],xmm29[14],xmm27[14],xmm29[15],xmm27[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm20[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm23, %ymm23 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[2,1,3,3,6,5,7,7] +; AVX512BW-NEXT: vpermw %ymm20, %ymm11, %ymm20 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm20 +; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm16 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm16 {%k3} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm17[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm17[0,1,2,3,4,6,6,7] ; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm19, %ymm3, %ymm19 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm28[0],xmm24[0],xmm28[1],xmm24[1],xmm28[2],xmm24[2],xmm28[3],xmm24[3],xmm28[4],xmm24[4],xmm28[5],xmm24[5],xmm28[6],xmm24[6],xmm28[7],xmm24[7] +; AVX512BW-NEXT: vpermw %ymm17, %ymm3, %ymm17 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm17 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm26[0],xmm21[0],xmm26[1],xmm21[1],xmm26[2],xmm21[2],xmm26[3],xmm21[3],xmm26[4],xmm21[4],xmm26[5],xmm21[5],xmm26[6],xmm21[6],xmm26[7],xmm21[7] ; 
AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,4,5,5,7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm23, %ymm23 +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm20[0,1,2,3,6,5,7,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm23, %ymm23 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[0,2,2,3,4,6,6,7] ; AVX512BW-NEXT: vpermw %ymm20, %ymm6, %ymm20 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm20 -; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm20 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3],xmm27[4],xmm21[4],xmm27[5],xmm21[5],xmm27[6],xmm21[6],xmm27[7],xmm21[7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm23 = xmm19[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm25 = xmm19[3,3,3,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm23, %ymm23 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm19 = xmm19[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm19, %ymm25, %ymm19 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm23, %zmm19, %zmm19 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm17[0],xmm22[1],xmm17[1],xmm22[2],xmm17[2],xmm22[3],xmm17[3],xmm22[4],xmm17[4],xmm22[5],xmm17[5],xmm22[6],xmm17[6],xmm22[7],xmm17[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm23[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm23[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7] +; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm20 {%k1} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm24[0],xmm19[0],xmm24[1],xmm19[1],xmm24[2],xmm19[2],xmm24[3],xmm19[3],xmm24[4],xmm19[4],xmm24[5],xmm19[5],xmm24[6],xmm19[6],xmm24[7],xmm19[7] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm23 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm17 = xmm17[2,3,2,3] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero +; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm23, %zmm17 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm23[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm28 = xmm23[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm28, %ymm27, %ymm27 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm27 = ymm27[2,1,3,3,6,5,7,7] ; AVX512BW-NEXT: vpermw %ymm23, %ymm11, %ymm23 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm23, %zmm23 -; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k3} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm27, %zmm23, %zmm23 +; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k3} ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512BW-NEXT: 
vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,4,6,5] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm12[0,1,2,3,4,6,6,7] @@ -8800,33 +8364,27 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] ; AVX512BW-NEXT: vpermw %ymm12, %ymm3, %ymm12 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm28[8],xmm24[8],xmm28[9],xmm24[9],xmm28[10],xmm24[10],xmm28[11],xmm24[11],xmm28[12],xmm24[12],xmm28[13],xmm24[13],xmm28[14],xmm24[14],xmm28[15],xmm24[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm26[8],xmm21[8],xmm26[9],xmm21[9],xmm26[10],xmm21[10],xmm26[11],xmm21[11],xmm26[12],xmm21[12],xmm26[13],xmm21[13],xmm26[14],xmm21[14],xmm26[15],xmm21[15] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm13[0,1,2,3,4,5,5,7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm13[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm13[0,1,2,3,6,5,7,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] ; AVX512BW-NEXT: vpermw %ymm13, %ymm6, %ymm13 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm13, %zmm13 ; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm27[8],xmm21[8],xmm27[9],xmm21[9],xmm27[10],xmm21[10],xmm27[11],xmm21[11],xmm27[12],xmm21[12],xmm27[13],xmm21[13],xmm27[14],xmm21[14],xmm27[15],xmm21[15] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm12[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm12[3,3,3,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm12, %ymm21, %ymm12 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm12, %zmm12 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm22[8],xmm17[8],xmm22[9],xmm17[9],xmm22[10],xmm17[10],xmm22[11],xmm17[11],xmm22[12],xmm17[12],xmm22[13],xmm17[13],xmm22[14],xmm17[14],xmm22[15],xmm17[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm17[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm17[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm24[8],xmm19[8],xmm24[9],xmm19[9],xmm24[10],xmm19[10],xmm24[11],xmm19[11],xmm24[12],xmm19[12],xmm24[13],xmm19[13],xmm24[14],xmm19[14],xmm24[15],xmm19[15] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm19 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero +; AVX512BW-NEXT: vinserti64x4 $1, %ymm12, %zmm19, %zmm12 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm19[0,1,2,3,4,6,6,7] ; AVX512BW-NEXT: vinserti32x4 $1, 
%xmm21, %ymm20, %ymm20 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vpermw %ymm17, %ymm11, %ymm17 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm17 -; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm12 {%k2} +; AVX512BW-NEXT: vpermw %ymm19, %ymm11, %ymm19 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19 +; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm12 {%k2} ; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k3} ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] @@ -8844,16 +8402,10 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] @@ -8866,7 +8418,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm16, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 256(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm15, 448(%rax) @@ -9086,22 +8638,16 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 48(%rsi), %xmm24 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdi), %xmm27 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdi), %xmm26 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm9 = 
xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[3,3,3,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 48(%rcx), %xmm28 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdx), %xmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdx), %xmm30 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm20[0,1,2,3,4,4,6,5] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,6,6,7] @@ -9131,18 +8677,12 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm6, %ymm20 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm23 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm14, %zmm23 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm27[0],xmm24[0],xmm27[1],xmm24[1],xmm27[2],xmm24[2],xmm27[3],xmm24[3],xmm27[4],xmm24[4],xmm27[5],xmm24[5],xmm27[6],xmm24[6],xmm27[7],xmm24[7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm14[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm22 = xmm14[3,3,3,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm14, %ymm22, %ymm14 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm14, %zmm14 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm20 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero +; AVX512DQ-BW-NEXT: 
vinserti64x4 $1, %ymm14, %zmm20, %zmm14 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,4,6,5] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,6,6,7] ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm22, %ymm25 @@ -9165,144 +8705,120 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,5,5,7] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm21 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %xmm19 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm19 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %xmm27 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[0,2,2,3,4,6,6,7] ; AVX512DQ-BW-NEXT: vpermw %ymm18, %ymm6, %ymm18 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm26 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm18 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm29 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm15, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm27[8],xmm24[8],xmm27[9],xmm24[9],xmm27[10],xmm24[10],xmm27[11],xmm24[11],xmm27[12],xmm24[12],xmm27[13],xmm24[13],xmm27[14],xmm24[14],xmm27[15],xmm24[15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm15[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm24 = xmm15[3,3,3,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm19 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero,xmm15[2],zero,zero,zero,xmm15[3],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,3,2,3] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero,xmm15[2],zero,zero,zero,xmm15[3],zero,zero,zero +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm19, %zmm15 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm19[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm19[0,1,2,3,4,6,6,7] ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm15, %ymm24, %ymm15 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm15, %zmm15 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = 
xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm24, %ymm24 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vpermw %ymm21, %ymm11, %ymm21 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm24, %zmm21, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm15 {%k2} +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[2,1,3,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vpermw %ymm19, %ymm11, %ymm19 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm19, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm15 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm15 {%k3} ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm18[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm19 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm18, %ymm3, %ymm18 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm18 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm19[0,1,2,3,4,5,5,7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm19[0,1,2,3,6,5,7,7] ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm18, %ymm3, %ymm18 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,5,5,7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm24, %ymm24 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm21, %ymm6, %ymm21 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm24, %zmm21, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm18, %zmm27 {%k1} +; AVX512DQ-BW-NEXT: vpermw %ymm19, %ymm6, %ymm19 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm19, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm18, %zmm24 {%k1} ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm23[0],xmm20[0],xmm23[1],xmm20[1],xmm23[2],xmm20[2],xmm23[3],xmm20[3],xmm23[4],xmm20[4],xmm23[5],xmm20[5],xmm23[6],xmm20[6],xmm23[7],xmm20[7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm18[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm24 = xmm18[3,3,3,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} 
xmm24 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm18 = xmm18[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm18, %ymm24, %ymm18 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm26[0],xmm19[0],xmm26[1],xmm19[1],xmm26[2],xmm19[2],xmm26[3],xmm19[3],xmm26[4],xmm19[4],xmm26[5],xmm19[5],xmm26[6],xmm19[6],xmm26[7],xmm19[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm28 = xmm21[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm28, %ymm24, %ymm28 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%r9), %xmm24 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm28 = ymm28[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vpermw %ymm21, %ymm11, %ymm21 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm28, %zmm21, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%r8), %xmm28 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rsi), %xmm21 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm27, %zmm18 {%k3} +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm19 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero,xmm18[2],zero,zero,zero,xmm18[3],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm18 = xmm18[2,3,2,3] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero,xmm18[2],zero,zero,zero,xmm18[3],zero,zero,zero +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm18, %zmm19, %zmm18 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm29[0],xmm27[0],xmm29[1],xmm27[1],xmm29[2],xmm27[2],xmm29[3],xmm27[3],xmm29[4],xmm27[4],xmm29[5],xmm27[5],xmm29[6],xmm27[6],xmm29[7],xmm27[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm19[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm19[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm26, %ymm21, %ymm26 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%r9), %xmm21 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm26 = ymm26[2,1,3,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vpermw %ymm19, %ymm11, %ymm19 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm26, %zmm19, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%r8), %xmm26 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm18 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rsi), %xmm19 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm18 {%k3} ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm16[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm16[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm17, %ymm17 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdi), %xmm27 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm16[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm17, %ymm17 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdi), %xmm24 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7] ; AVX512DQ-BW-NEXT: vpermw %ymm16, %ymm3, %ymm16 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm22[0,1,2,3,4,5,5,7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm22[0,1,2,3,6,5,7,7] -; 
AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm17, %ymm25 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rcx), %xmm17 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm17[0,1,2,3,4,5,5,7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm17[0,1,2,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm22, %ymm25 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rcx), %xmm22 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm22, %ymm6, %ymm22 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm22, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdx), %xmm22 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vpermw %ymm17, %ymm6, %ymm17 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm17, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdx), %xmm25 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm17 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm23[8],xmm20[8],xmm23[9],xmm20[9],xmm23[10],xmm20[10],xmm23[11],xmm20[11],xmm23[12],xmm20[12],xmm23[13],xmm20[13],xmm23[14],xmm20[14],xmm23[15],xmm20[15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm16[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm23 = xmm16[3,3,3,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm16 = xmm16[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm16, %ymm23, %ymm16 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm16, %zmm16 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm19[8],xmm26[9],xmm19[9],xmm26[10],xmm19[10],xmm26[11],xmm19[11],xmm26[12],xmm19[12],xmm26[13],xmm19[13],xmm26[14],xmm19[14],xmm26[15],xmm19[15] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vpermw %ymm19, %ymm11, %ymm19 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm16 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k3} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm20 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm16 = xmm16[2,3,2,3] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm16, %zmm20, %zmm16 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = 
xmm29[8],xmm27[8],xmm29[9],xmm27[9],xmm29[10],xmm27[10],xmm29[11],xmm27[11],xmm29[12],xmm27[12],xmm29[13],xmm27[13],xmm29[14],xmm27[14],xmm29[15],xmm27[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm20[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm23, %ymm23 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[2,1,3,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm11, %ymm20 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm16 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm17, %zmm16 {%k3} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm17[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm17[0,1,2,3,4,6,6,7] ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm19, %ymm3, %ymm19 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm28[0],xmm24[0],xmm28[1],xmm24[1],xmm28[2],xmm24[2],xmm28[3],xmm24[3],xmm28[4],xmm24[4],xmm28[5],xmm24[5],xmm28[6],xmm24[6],xmm28[7],xmm24[7] +; AVX512DQ-BW-NEXT: vpermw %ymm17, %ymm3, %ymm17 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm17 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm26[0],xmm21[0],xmm26[1],xmm21[1],xmm26[2],xmm21[2],xmm26[3],xmm21[3],xmm26[4],xmm21[4],xmm26[5],xmm21[5],xmm26[6],xmm21[6],xmm26[7],xmm21[7] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,4,5,5,7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm23, %ymm23 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm20[0,1,2,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm23, %ymm23 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[0,2,2,3,4,6,6,7] ; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm6, %ymm20 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3],xmm27[4],xmm21[4],xmm27[5],xmm21[5],xmm27[6],xmm21[6],xmm27[7],xmm21[7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm23 = xmm19[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm25 = xmm19[3,3,3,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm23, %ymm23 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm19 = xmm19[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm19, %ymm25, %ymm19 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm23, %zmm19, %zmm19 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm17[0],xmm22[1],xmm17[1],xmm22[2],xmm17[2],xmm22[3],xmm17[3],xmm22[4],xmm17[4],xmm22[5],xmm17[5],xmm22[6],xmm17[6],xmm22[7],xmm17[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm23[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm23[0,1,2,3,4,6,6,7] -; 
AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm20 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm24[0],xmm19[0],xmm24[1],xmm19[1],xmm24[2],xmm19[2],xmm24[3],xmm19[3],xmm24[4],xmm19[4],xmm24[5],xmm19[5],xmm24[6],xmm19[6],xmm24[7],xmm19[7] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm23 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm17 = xmm17[2,3,2,3] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm23, %zmm17 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm23[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm28 = xmm23[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm28, %ymm27, %ymm27 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm27 = ymm27[2,1,3,3,6,5,7,7] ; AVX512DQ-BW-NEXT: vpermw %ymm23, %ymm11, %ymm23 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm23, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k3} +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm27, %zmm23, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm23, %zmm17 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k3} ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,4,6,5] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm12[0,1,2,3,4,6,6,7] @@ -9310,33 +8826,27 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] ; AVX512DQ-BW-NEXT: vpermw %ymm12, %ymm3, %ymm12 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm28[8],xmm24[8],xmm28[9],xmm24[9],xmm28[10],xmm24[10],xmm28[11],xmm24[11],xmm28[12],xmm24[12],xmm28[13],xmm24[13],xmm28[14],xmm24[14],xmm28[15],xmm24[15] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm26[8],xmm21[8],xmm26[9],xmm21[9],xmm26[10],xmm21[10],xmm26[11],xmm21[11],xmm26[12],xmm21[12],xmm26[13],xmm21[13],xmm26[14],xmm21[14],xmm26[15],xmm21[15] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm13[0,1,2,3,4,5,5,7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm13[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm13[0,1,2,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] ; AVX512DQ-BW-NEXT: vpermw %ymm13, %ymm6, %ymm13 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm13, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm27[8],xmm21[8],xmm27[9],xmm21[9],xmm27[10],xmm21[10],xmm27[11],xmm21[11],xmm27[12],xmm21[12],xmm27[13],xmm21[13],xmm27[14],xmm21[14],xmm27[15],xmm21[15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm12[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = 
xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm12[3,3,3,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm12, %ymm21, %ymm12 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm12, %zmm12 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm22[8],xmm17[8],xmm22[9],xmm17[9],xmm22[10],xmm17[10],xmm22[11],xmm17[11],xmm22[12],xmm17[12],xmm22[13],xmm17[13],xmm22[14],xmm17[14],xmm22[15],xmm17[15] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm17[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm17[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm24[8],xmm19[8],xmm24[9],xmm19[9],xmm24[10],xmm19[10],xmm24[11],xmm19[11],xmm24[12],xmm19[12],xmm24[13],xmm19[13],xmm24[14],xmm19[14],xmm24[15],xmm19[15] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm19 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm12, %zmm19, %zmm12 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm19[0,1,2,3,4,6,6,7] ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vpermw %ymm17, %ymm11, %ymm17 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm12 {%k2} +; AVX512DQ-BW-NEXT: vpermw %ymm19, %ymm11, %ymm19 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm12 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k3} ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] @@ -9354,16 +8864,10 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; 
AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] @@ -9376,7 +8880,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 320(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 256(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 448(%rax) diff --git a/llvm/test/CodeGen/X86/widen_bitcnt.ll b/llvm/test/CodeGen/X86/widen_bitcnt.ll index 541dfb54e96d2..cca9d4aa2a9f0 100644 --- a/llvm/test/CodeGen/X86/widen_bitcnt.ll +++ b/llvm/test/CodeGen/X86/widen_bitcnt.ll @@ -241,81 +241,77 @@ define <8 x i32> @widen_ctpop_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32 ; ; AVX2-LABEL: widen_ctpop_v2i32_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm5 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpand %xmm4, %xmm1, %xmm5 -; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm7 +; AVX2-NEXT: vpand %xmm5, %xmm7, %xmm7 +; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm7 +; AVX2-NEXT: vpand %xmm5, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-NEXT: vpaddb %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm5 -; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm8 +; AVX2-NEXT: vpand %xmm5, %xmm8, %xmm8 +; AVX2-NEXT: vpshufb %xmm8, %xmm6, %xmm8 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm8[0] +; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-NEXT: vpaddb %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = 
xmm2[0],zero,xmm2[1],zero -; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm5 -; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vpsrlw $4, %xmm3, %xmm3 -; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpaddb %xmm0, %xmm4, %xmm0 +; AVX2-NEXT: vpsrlw $4, %xmm3, %xmm2 +; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm2[0] +; AVX2-NEXT: vpand %xmm5, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-NEXT: vpaddb %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpsadbw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: widen_ctpop_v2i32_v8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %xmm4, %xmm0, %xmm5 +; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %xmm5, %xmm4, %xmm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VL-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX512VL-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX512VL-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512VL-NEXT: vpand %xmm4, %xmm1, %xmm5 -; AVX512VL-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm7 +; AVX512VL-NEXT: vpand %xmm5, %xmm7, %xmm7 +; AVX512VL-NEXT: vpshufb %xmm7, %xmm6, %xmm7 +; AVX512VL-NEXT: vpand %xmm5, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512VL-NEXT: vpaddb %xmm5, %xmm1, %xmm1 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX512VL-NEXT: vpand %xmm4, %xmm2, %xmm5 -; AVX512VL-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX512VL-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpsrlw $4, %xmm2, %xmm8 +; AVX512VL-NEXT: vpand %xmm5, %xmm8, %xmm8 +; AVX512VL-NEXT: vpshufb %xmm8, %xmm6, %xmm8 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm8[0] +; AVX512VL-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512VL-NEXT: vpaddb %xmm0, %xmm4, %xmm0 +; AVX512VL-NEXT: vpsrlw $4, %xmm3, %xmm2 +; AVX512VL-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512VL-NEXT: vpaddb %xmm5, %xmm2, %xmm2 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512VL-NEXT: vpand %xmm4, %xmm3, 
%xmm5 -; AVX512VL-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX512VL-NEXT: vpsrlw $4, %xmm3, %xmm3 -; AVX512VL-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm2[0] +; AVX512VL-NEXT: vpand %xmm5, %xmm3, %xmm3 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512VL-NEXT: vpaddb %xmm5, %xmm3, %xmm3 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512VL-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX512VL-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpsadbw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512VL-NEXT: vpsadbw %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; @@ -1304,47 +1300,45 @@ define <8 x i32> @widen_cttz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpaddd %xmm4, %xmm0, %xmm5 ; AVX2-NEXT: vpandn %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm5, %xmm0, %xmm6 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm5 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm6, %xmm5, %xmm5 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX2-NEXT: vpand %xmm6, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-NEXT: vpaddb %xmm6, %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpaddd %xmm4, %xmm1, %xmm6 -; AVX2-NEXT: vpandn %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm5, %xmm1, %xmm6 -; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm4, %xmm1, %xmm8 +; AVX2-NEXT: vpandn %xmm8, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm8 +; AVX2-NEXT: vpand %xmm6, %xmm8, %xmm8 +; AVX2-NEXT: vpshufb %xmm8, %xmm7, %xmm8 +; AVX2-NEXT: vpand %xmm6, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-NEXT: vpaddb %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-NEXT: vpaddd %xmm4, %xmm2, %xmm6 -; AVX2-NEXT: vpandn %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm6 -; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpaddd %xmm4, %xmm2, %xmm9 +; AVX2-NEXT: vpandn %xmm9, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm9 +; AVX2-NEXT: vpand %xmm6, %xmm9, %xmm9 +; AVX2-NEXT: vpshufb %xmm9, %xmm7, %xmm9 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm9[0] +; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX2-NEXT: vpaddb %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm4 -; AVX2-NEXT: vpandn %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpand %xmm5, %xmm3, %xmm4 -; 
AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX2-NEXT: vpsrlw $4, %xmm3, %xmm3 -; AVX2-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpaddb %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm2 +; AVX2-NEXT: vpandn %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm3 +; AVX2-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX2-NEXT: vpaddb %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm8[0],xmm3[0] +; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpsadbw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -1649,47 +1643,45 @@ define <8 x i32> @widen_cttz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpaddd %xmm4, %xmm0, %xmm5 ; AVX2-NEXT: vpandn %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm5, %xmm0, %xmm6 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm5 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm6, %xmm5, %xmm5 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX2-NEXT: vpand %xmm6, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-NEXT: vpaddb %xmm6, %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpaddd %xmm4, %xmm1, %xmm6 -; AVX2-NEXT: vpandn %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm5, %xmm1, %xmm6 -; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm4, %xmm1, %xmm8 +; AVX2-NEXT: vpandn %xmm8, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm8 +; AVX2-NEXT: vpand %xmm6, %xmm8, %xmm8 +; AVX2-NEXT: vpshufb %xmm8, %xmm7, %xmm8 +; AVX2-NEXT: vpand %xmm6, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-NEXT: vpaddb %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-NEXT: vpaddd %xmm4, %xmm2, %xmm6 -; AVX2-NEXT: vpandn %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm6 -; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpaddd %xmm4, %xmm2, %xmm9 +; AVX2-NEXT: vpandn %xmm9, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm9 +; AVX2-NEXT: vpand %xmm6, %xmm9, %xmm9 +; AVX2-NEXT: vpshufb %xmm9, %xmm7, %xmm9 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm9[0] +; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX2-NEXT: vpaddb %xmm6, 
%xmm2, %xmm2 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm4 -; AVX2-NEXT: vpandn %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpand %xmm5, %xmm3, %xmm4 -; AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX2-NEXT: vpsrlw $4, %xmm3, %xmm3 -; AVX2-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpaddb %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm2 +; AVX2-NEXT: vpandn %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm3 +; AVX2-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX2-NEXT: vpaddb %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm8[0],xmm3[0] +; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpsadbw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; From 8aff59d3f4e53751b23cd3bc22a74f8677c57d7d Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Mon, 17 Feb 2025 11:22:08 -0500 Subject: [PATCH 031/127] [NFC][AMDGPU] Auto generate check lines for three test cases (#127352) - `CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll` - `CodeGen/AMDGPU/call-preserved-registers.ll` - `CodeGen/AMDGPU/stack-realign.ll` This is to make preparation for another PR. 
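
For reference, the check lines in these files are produced by the script named in each test's autogenerated NOTE header (`utils/update_llc_test_checks.py`). A typical regeneration command, sketched here with an assumed build directory for the `llc` binary, is:

    # Path to llc is an assumption; point --llc-binary at your own build.
    llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
        llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll \
        llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll \
        llvm/test/CodeGen/AMDGPU/stack-realign.ll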
--- .../AMDGPU/call-preserved-registers.ll | 823 +++++++++++++----- .../spill_more_than_wavesize_csr_sgprs.ll | 319 ++++++- llvm/test/CodeGen/AMDGPU/stack-realign.ll | 813 +++++++++++++---- 3 files changed, 1569 insertions(+), 386 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll index ff80e05197b0d..db9ce56ecc3cc 100644 --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s @@ -5,110 +6,258 @@ declare hidden void @external_void_func_void() #3 -; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: -; GCN: s_getpc_b64 s[34:35] -; GCN-NEXT: s_add_u32 s34, s34, -; GCN-NEXT: s_addc_u32 s35, s35, -; GCN: s_swappc_b64 s[30:31], s[34:35] - -; GCN-NEXT: #ASMSTART -; GCN-NEXT: #ASMEND -; GCN-NEXT: s_swappc_b64 s[30:31], s[34:35] define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { +; FLATSCR-LABEL: test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_getpc_b64 s[34:35] +; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] +; FLATSCR-NEXT: s_endpgm call void @external_void_func_void() call void asm sideeffect "", ""() #0 call void @external_void_func_void() ret void } -; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; MUBUF: buffer_store_dword -; FLATSCR: scratch_store_dword -; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4 -; GCN: v_writelane_b32 v40, s30, 0 -; GCN: v_writelane_b32 v40, s31, 1 -; GCN: v_writelane_b32 v40, s34, 2 -; GCN: v_writelane_b32 v40, s35, 3 - -; GCN: s_swappc_b64 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_swappc_b64 -; GCN: v_readlane_b32 s35, v40, 3 -; GCN: v_readlane_b32 s34, v40, 2 -; MUBUF-DAG: v_readlane_b32 s31, v40, 1 -; MUBUF-DAG: v_readlane_b32 s30, v40, 0 -; FLATSCR-DAG: v_readlane_b32 s31, v40, 1 -; FLATSCR-DAG: v_readlane_b32 s30, v40, 0 - -; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4 -; MUBUF: buffer_load_dword -; FLATSCR: scratch_load_dword -; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; GCN: s_setpc_b64 s[30:31] define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { +; MUBUF-LABEL: test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: 
s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v40, s4, 4 +; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_writelane_b32 v40, s34, 2 +; MUBUF-NEXT: v_writelane_b32 v40, s35, 3 +; MUBUF-NEXT: s_getpc_b64 s[34:35] +; MUBUF-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35] +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35] +; MUBUF-NEXT: v_readlane_b32 s35, v40, 3 +; MUBUF-NEXT: v_readlane_b32 s34, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: v_readlane_b32 s4, v40, 4 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v40, s0, 4 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2 +; FLATSCR-NEXT: v_writelane_b32 v40, s35, 3 +; FLATSCR-NEXT: s_getpc_b64 s[34:35] +; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] +; FLATSCR-NEXT: v_readlane_b32 s35, v40, 3 +; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: v_readlane_b32 s0, v40, 4 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void @external_void_func_void() call void asm sideeffect "", ""() #0 call void @external_void_func_void() ret void } -; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN: s_mov_b32 s33, s32 -; MUBUF: buffer_store_dword v40 -; FLATSCR: scratch_store_dword off, v40 -; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4 -; MUBUF: s_addk_i32 s32, 0x400 -; FLATSCR: s_add_i32 s32, s32, 16 - -; GCN: s_swappc_b64 -; GCN-NEXT: s_swappc_b64 - -; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4 -; MUBUF: buffer_load_dword v40 -; FLATSCR: scratch_load_dword v40 -; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] define void @test_func_call_external_void_funcx2() #0 
{ +; MUBUF-LABEL: test_func_call_external_void_funcx2: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v40, s4, 4 +; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_writelane_b32 v40, s34, 2 +; MUBUF-NEXT: v_writelane_b32 v40, s35, 3 +; MUBUF-NEXT: s_getpc_b64 s[34:35] +; MUBUF-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35] +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35] +; MUBUF-NEXT: v_readlane_b32 s35, v40, 3 +; MUBUF-NEXT: v_readlane_b32 s34, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: v_readlane_b32 s4, v40, 4 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: test_func_call_external_void_funcx2: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v40, s0, 4 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2 +; FLATSCR-NEXT: v_writelane_b32 v40, s35, 3 +; FLATSCR-NEXT: s_getpc_b64 s[34:35] +; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] +; FLATSCR-NEXT: v_readlane_b32 s35, v40, 3 +; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: v_readlane_b32 s0, v40, 4 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void @external_void_func_void() call void @external_void_func_void() ret void } -; GCN-LABEL: {{^}}void_func_void_clobber_s30_s31: -; GCN: s_waitcnt -; GCN: v_writelane_b32 v0, s30, 0 -; GCN: v_writelane_b32 v0, s31, 1 -; GCN-NEXT: #ASMSTART -; GCN: ; clobber -; GCN-NEXT: #ASMEND -; GCN: v_readlane_b32 s31, v0, 1 -; GCN: v_readlane_b32 s30, v0, 0 -; GCN: s_setpc_b64 s[30:31] define void @void_func_void_clobber_s30_s31() #2 { +; MUBUF-LABEL: void_func_void_clobber_s30_s31: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: 
v_writelane_b32 v0, s30, 0 +; MUBUF-NEXT: v_writelane_b32 v0, s31, 1 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s31, v0, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v0, 0 +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: void_func_void_clobber_s30_s31: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: v_writelane_b32 v0, s30, 0 +; FLATSCR-NEXT: v_writelane_b32 v0, s31, 1 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s31, v0, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v0, 0 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{s[30:31]}"() #0 ret void } -; GCN-LABEL: {{^}}void_func_void_clobber_vcc: -; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_setpc_b64 s[30:31] define hidden void @void_func_void_clobber_vcc() #2 { +; GCN-LABEL: void_func_void_clobber_vcc: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "", "~{vcc}"() #0 ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_clobber_vcc: -; GCN: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN: s_mov_b64 s[34:35], vcc -; GCN-NEXT: s_swappc_b64 -; GCN: s_mov_b64 vcc, s[34:35] define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(ptr addrspace(1) %out) #0 { +; FLATSCR-LABEL: test_call_void_func_void_clobber_vcc: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_add_u32 s8, s4, 8 +; FLATSCR-NEXT: s_addc_u32 s9, s5, 0 +; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; FLATSCR-NEXT: s_mov_b32 s14, s12 +; FLATSCR-NEXT: s_mov_b32 s13, s11 +; FLATSCR-NEXT: s_mov_b32 s12, s10 +; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7] +; FLATSCR-NEXT: s_getpc_b64 s[16:17] +; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_vcc@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_vcc@rel32@hi+12 +; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2 +; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1] +; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3] +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def vcc +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_mov_b64 s[34:35], vcc +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17] +; FLATSCR-NEXT: global_load_dword v0, v[0:1], off glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_mov_b64 vcc, s[34:35] +; FLATSCR-NEXT: global_load_dword v0, v[0:1], off glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: ; kill: killed $vgpr0_vgpr1 +; FLATSCR-NEXT: ; kill: killed $vgpr0_vgpr1 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use vcc +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %vcc = call i64 
asm sideeffect "; def $0", "={vcc}"() call void @void_func_void_clobber_vcc() %val0 = load volatile i32, ptr addrspace(1) undef @@ -117,22 +266,50 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(ptr addrspace(1) ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_s31: -; GCN: s_mov_b32 s33, s31 -; GCN: s_swappc_b64 -; GCN-NEXT: s_mov_b32 s31, s33 define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) %out) #0 { +; FLATSCR-LABEL: test_call_void_func_void_mayclobber_s31: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s31 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_mov_b32 s33, s31 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: s_mov_b32 s31, s33 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s31 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %s31 = call i32 asm sideeffect "; def $0", "={s31}"() call void @external_void_func_void() call void asm sideeffect "; use $0", "{s31}"(i32 %s31) ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31: -; GCN: v_mov_b32_e32 v40, v31 -; GCN: s_swappc_b64 -; GCN-NEXT: v_mov_b32_e32 v31, v40 define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) %out) #0 { +; FLATSCR-LABEL: test_call_void_func_void_mayclobber_v31: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def v31 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_mov_b32_e32 v40, v31 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: v_mov_b32_e32 v31, v40 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use v31 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %v31 = call i32 asm sideeffect "; def $0", "={v31}"() call void @external_void_func_void() call void asm sideeffect "; use $0", "{v31}"(i32 %v31) @@ -140,175 +317,294 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(ptr addrspace } ; FIXME: What is the expected behavior for reserved registers here? 
- -; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33: -; FLATSCR: s_getpc_b64 s[0:1] -; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; MUBUF: s_getpc_b64 s[4:5] -; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 - -; GCN: #ASMSTART -; GCN-NEXT: ; def s33 -; GCN-NEXT: #ASMEND - -; GCN-NOT: s33 - -; FLATSCR: s_swappc_b64 s[30:31], s[0:1] -; MUBUF: s_swappc_b64 s[30:31], s[4:5] - -; GCN-NOT: s33 - -; GCN: ;;#ASMSTART -; GCN-NEXT: ; use s33 -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_preserves_s33(ptr addrspace(1) %out) #0 { +; FLATSCR-LABEL: test_call_void_func_void_preserves_s33: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s33 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s33 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %s33 = call i32 asm sideeffect "; def $0", "={s33}"() call void @external_void_func_void() call void asm sideeffect "; use $0", "{s33}"(i32 %s33) ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}} -; GCN-NOT: s34 - -; FLATSCR: s_getpc_b64 s[0:1] -; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; MUBUF: s_getpc_b64 s[4:5] -; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GCN: s_mov_b32 s32, 0 - -; GCN: ;;#ASMSTART -; GCN-NEXT: ; def s34 -; GCN-NEXT: ;;#ASMEND - -; GCN-NOT: s34 - -; MUBUF: s_swappc_b64 s[30:31], s[4:5] -; FLATSCR: s_swappc_b64 s[30:31], s[0:1] - -; GCN-NOT: s34 - -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; use s34 -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_preserves_s34(ptr addrspace(1) %out) #0 { +; FLATSCR-LABEL: test_call_void_func_void_preserves_s34: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s34 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s34 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %s34 = call i32 asm sideeffect "; def $0", "={s34}"() call void @external_void_func_void() call void asm sideeffect "; use $0", "{s34}"(i32 %s34) ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}} - -; MUBUF: s_getpc_b64 s[4:5] -; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; FLATSCR: s_getpc_b64 s[0:1] -; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s1, s1, 
external_void_func_void@rel32@hi+12 -; GCN: s_mov_b32 s32, 0 - -; GCN: ;;#ASMSTART -; GCN-NEXT: ; def v40 -; GCN-NEXT: ;;#ASMEND - -; GCN-NOT: v40 - -; MUBUF: s_swappc_b64 s[30:31], s[4:5] -; FLATSCR: s_swappc_b64 s[30:31], s[0:1] - -; GCN-NOT: v40 - -; GCN: ;;#ASMSTART -; GCN-NEXT: ; use v40 -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_preserves_v40(ptr addrspace(1) %out) #0 { +; FLATSCR-LABEL: test_call_void_func_void_preserves_v40: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %v40 = call i32 asm sideeffect "; def $0", "={v40}"() call void @external_void_func_void() call void asm sideeffect "; use $0", "{v40}"(i32 %v40) ret void } -; GCN-LABEL: {{^}}void_func_void_clobber_s33: -; GCN: v_writelane_b32 v0, s33, 0 -; GCN-NEXT: #ASMSTART -; GCN-NEXT: ; clobber -; GCN-NEXT: #ASMEND -; GCN-NEXT: v_readlane_b32 s33, v0, 0 -; GCN: s_setpc_b64 define hidden void @void_func_void_clobber_s33() #2 { +; MUBUF-LABEL: void_func_void_clobber_s33: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: v_writelane_b32 v0, s33, 0 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s33, v0, 0 +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: void_func_void_clobber_s33: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: v_writelane_b32 v0, s33, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s33, v0, 0 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{s33}"() #0 ret void } -; GCN-LABEL: {{^}}void_func_void_clobber_s34: -; GCN: v_writelane_b32 v0, s34, 0 -; GCN-NEXT: #ASMSTART -; GCN-NEXT: ; clobber -; GCN-NEXT: #ASMEND -; GCN-NEXT: v_readlane_b32 s34, v0, 0 -; GCN: s_setpc_b64 define hidden void @void_func_void_clobber_s34() #2 { +; MUBUF-LABEL: void_func_void_clobber_s34: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: v_writelane_b32 v0, s34, 0 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber +; MUBUF-NEXT: 
;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s34, v0, 0 +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: void_func_void_clobber_s34: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: v_writelane_b32 v0, s34, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s34, v0, 0 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{s34}"() #0 ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33: -; GCN: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN: s_mov_b32 s32, 0 -; GCN: s_swappc_b64 -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 { +; FLATSCR-LABEL: test_call_void_func_void_clobber_s33: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; FLATSCR-NEXT: s_mov_b32 s14, s12 +; FLATSCR-NEXT: s_mov_b32 s13, s11 +; FLATSCR-NEXT: s_mov_b32 s12, s10 +; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7] +; FLATSCR-NEXT: s_mov_b64 s[8:9], s[4:5] +; FLATSCR-NEXT: s_getpc_b64 s[16:17] +; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_s33@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_s33@rel32@hi+12 +; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2 +; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1] +; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3] +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17] +; FLATSCR-NEXT: s_endpgm call void @void_func_void_clobber_s33() ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s34: -; GCN: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN: s_mov_b32 s32, 0 -; GCN: s_swappc_b64 -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 { +; FLATSCR-LABEL: test_call_void_func_void_clobber_s34: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; FLATSCR-NEXT: s_mov_b32 s14, s12 +; FLATSCR-NEXT: s_mov_b32 s13, s11 +; FLATSCR-NEXT: s_mov_b32 s12, s10 +; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7] +; FLATSCR-NEXT: s_mov_b64 s[8:9], s[4:5] +; FLATSCR-NEXT: s_getpc_b64 s[16:17] +; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_s34@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_s34@rel32@hi+12 +; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2 +; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1] +; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3] +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17] +; FLATSCR-NEXT: s_endpgm call void @void_func_void_clobber_s34() ret void } -; GCN-LABEL: {{^}}callee_saved_sgpr_func: -; GCN-NOT: s40 -; GCN: v_writelane_b32 v40, s40 -; GCN: 
s_swappc_b64 -; GCN-NOT: s40 -; GCN: ; use s40 -; GCN-NOT: s40 -; GCN: v_readlane_b32 s40, v40 -; GCN-NOT: s40 define void @callee_saved_sgpr_func() #2 { +; MUBUF-LABEL: callee_saved_sgpr_func: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v40, s4, 3 +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: s_getpc_b64 s[4:5] +; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: v_writelane_b32 v40, s40, 2 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def s40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use s40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s40, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: v_readlane_b32 s4, v40, 3 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_saved_sgpr_func: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v40, s0, 3 +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: v_writelane_b32 v40, s40, 2 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s40, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: v_readlane_b32 s0, v40, 3 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 call void @external_void_func_void() call void asm sideeffect "; use $0", "s"(i32 %s40) #0 ret void } -; GCN-LABEL: {{^}}callee_saved_sgpr_kernel: -; GCN-NOT: s40 -; GCN: ; def s40 -; GCN-NOT: s40 -; GCN: s_swappc_b64 -; GCN-NOT: s40 -; GCN: ; use s40 -; GCN-NOT: s40 define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 { +; FLATSCR-LABEL: callee_saved_sgpr_kernel: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: 
s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 call void @external_void_func_void() call void asm sideeffect "; use $0", "s"(i32 %s40) #0 @@ -316,16 +612,92 @@ define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 { } ; First call preserved VGPR is used so it can't be used for SGPR spills. -; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_func: -; GCN-NOT: s40 -; GCN: v_writelane_b32 v41, s40 -; GCN: s_swappc_b64 -; GCN-NOT: s40 -; GCN: ; use s40 -; GCN-NOT: s40 -; GCN: v_readlane_b32 s40, v41 -; GCN-NOT: s40 define void @callee_saved_sgpr_vgpr_func() #2 { +; MUBUF-LABEL: callee_saved_sgpr_vgpr_func: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v41, s4, 3 +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_writelane_b32 v41, s30, 0 +; MUBUF-NEXT: v_writelane_b32 v41, s31, 1 +; MUBUF-NEXT: s_getpc_b64 s[4:5] +; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: v_writelane_b32 v41, s40, 2 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def s40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def v40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use s40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use v40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: v_readlane_b32 s40, v41, 2 +; MUBUF-NEXT: v_readlane_b32 s31, v41, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v41, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: v_readlane_b32 s4, v41, 3 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_saved_sgpr_vgpr_func: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v41, s0, 3 +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_writelane_b32 v41, s30, 0 +; FLATSCR-NEXT: v_writelane_b32 v41, s31, 1 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: v_writelane_b32 v41, s40, 
2 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: v_readlane_b32 s40, v41, 2 +; FLATSCR-NEXT: v_readlane_b32 s31, v41, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v41, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: v_readlane_b32 s0, v41, 3 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v41, off, s33 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 %v40 = call i32 asm sideeffect "; def v40", "={v40}"() #0 call void @external_void_func_void() @@ -334,15 +706,30 @@ define void @callee_saved_sgpr_vgpr_func() #2 { ret void } -; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_kernel: -; GCN-NOT: s40 -; GCN: ; def s40 -; GCN-NOT: s40 -; GCN: s_swappc_b64 -; GCN-NOT: s40 -; GCN: ; use s40 -; GCN-NOT: s40 define amdgpu_kernel void @callee_saved_sgpr_vgpr_kernel() #2 { +; FLATSCR-LABEL: callee_saved_sgpr_vgpr_kernel: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def v32 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_mov_b32_e32 v40, v32 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 %v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0 call void @external_void_func_void() diff --git a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll index d2b960fe43f84..0d6bccad89d82 100644 --- a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll @@ -1,13 +1,158 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -enable-var-scope %s -; CHECK-LABEL: {{^}}spill_more_than_wavesize_csr_sgprs: -; CHECK-DAG: v_writelane_b32 v0, s98, 63 -; CHECK-DAG: v_writelane_b32 v1, s99, 0 -; CHECK-NOT: dummy -; CHECK-DAG: v_readlane_b32 s99, v1, 0 -; CHECK-DAG: v_readlane_b32 s98, v0, 63 - define void @spill_more_than_wavesize_csr_sgprs() { +; CHECK-LABEL: spill_more_than_wavesize_csr_sgprs: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 
4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: v_writelane_b32 v0, s35, 0 +; CHECK-NEXT: v_writelane_b32 v0, s36, 1 +; CHECK-NEXT: v_writelane_b32 v0, s37, 2 +; CHECK-NEXT: v_writelane_b32 v0, s38, 3 +; CHECK-NEXT: v_writelane_b32 v0, s39, 4 +; CHECK-NEXT: v_writelane_b32 v0, s40, 5 +; CHECK-NEXT: v_writelane_b32 v0, s41, 6 +; CHECK-NEXT: v_writelane_b32 v0, s42, 7 +; CHECK-NEXT: v_writelane_b32 v0, s43, 8 +; CHECK-NEXT: v_writelane_b32 v0, s44, 9 +; CHECK-NEXT: v_writelane_b32 v0, s45, 10 +; CHECK-NEXT: v_writelane_b32 v0, s46, 11 +; CHECK-NEXT: v_writelane_b32 v0, s47, 12 +; CHECK-NEXT: v_writelane_b32 v0, s48, 13 +; CHECK-NEXT: v_writelane_b32 v0, s49, 14 +; CHECK-NEXT: v_writelane_b32 v0, s50, 15 +; CHECK-NEXT: v_writelane_b32 v0, s51, 16 +; CHECK-NEXT: v_writelane_b32 v0, s52, 17 +; CHECK-NEXT: v_writelane_b32 v0, s53, 18 +; CHECK-NEXT: v_writelane_b32 v0, s54, 19 +; CHECK-NEXT: v_writelane_b32 v0, s55, 20 +; CHECK-NEXT: v_writelane_b32 v0, s56, 21 +; CHECK-NEXT: v_writelane_b32 v0, s57, 22 +; CHECK-NEXT: v_writelane_b32 v0, s58, 23 +; CHECK-NEXT: v_writelane_b32 v0, s59, 24 +; CHECK-NEXT: v_writelane_b32 v0, s60, 25 +; CHECK-NEXT: v_writelane_b32 v0, s61, 26 +; CHECK-NEXT: v_writelane_b32 v0, s62, 27 +; CHECK-NEXT: v_writelane_b32 v0, s63, 28 +; CHECK-NEXT: v_writelane_b32 v0, s64, 29 +; CHECK-NEXT: v_writelane_b32 v0, s65, 30 +; CHECK-NEXT: v_writelane_b32 v0, s66, 31 +; CHECK-NEXT: v_writelane_b32 v0, s67, 32 +; CHECK-NEXT: v_writelane_b32 v0, s68, 33 +; CHECK-NEXT: v_writelane_b32 v0, s69, 34 +; CHECK-NEXT: v_writelane_b32 v0, s70, 35 +; CHECK-NEXT: v_writelane_b32 v0, s71, 36 +; CHECK-NEXT: v_writelane_b32 v0, s72, 37 +; CHECK-NEXT: v_writelane_b32 v0, s73, 38 +; CHECK-NEXT: v_writelane_b32 v0, s74, 39 +; CHECK-NEXT: v_writelane_b32 v0, s75, 40 +; CHECK-NEXT: v_writelane_b32 v0, s76, 41 +; CHECK-NEXT: v_writelane_b32 v0, s77, 42 +; CHECK-NEXT: v_writelane_b32 v0, s78, 43 +; CHECK-NEXT: v_writelane_b32 v0, s79, 44 +; CHECK-NEXT: v_writelane_b32 v0, s80, 45 +; CHECK-NEXT: v_writelane_b32 v0, s81, 46 +; CHECK-NEXT: v_writelane_b32 v0, s82, 47 +; CHECK-NEXT: v_writelane_b32 v0, s83, 48 +; CHECK-NEXT: v_writelane_b32 v0, s84, 49 +; CHECK-NEXT: v_writelane_b32 v0, s85, 50 +; CHECK-NEXT: v_writelane_b32 v0, s86, 51 +; CHECK-NEXT: v_writelane_b32 v0, s87, 52 +; CHECK-NEXT: v_writelane_b32 v0, s88, 53 +; CHECK-NEXT: v_writelane_b32 v0, s89, 54 +; CHECK-NEXT: v_writelane_b32 v0, s90, 55 +; CHECK-NEXT: v_writelane_b32 v0, s91, 56 +; CHECK-NEXT: v_writelane_b32 v0, s92, 57 +; CHECK-NEXT: v_writelane_b32 v0, s93, 58 +; CHECK-NEXT: v_writelane_b32 v0, s94, 59 +; CHECK-NEXT: v_writelane_b32 v0, s95, 60 +; CHECK-NEXT: v_writelane_b32 v1, s99, 0 +; CHECK-NEXT: v_writelane_b32 v0, s96, 61 +; CHECK-NEXT: v_writelane_b32 v1, s100, 1 +; CHECK-NEXT: v_writelane_b32 v0, s97, 62 +; CHECK-NEXT: v_writelane_b32 v1, s101, 2 +; CHECK-NEXT: v_writelane_b32 v0, s98, 63 +; CHECK-NEXT: v_writelane_b32 v1, s102, 3 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s102, v1, 3 +; CHECK-NEXT: v_readlane_b32 s101, v1, 2 +; CHECK-NEXT: v_readlane_b32 s100, v1, 1 +; CHECK-NEXT: v_readlane_b32 s99, v1, 0 +; CHECK-NEXT: v_readlane_b32 s98, v0, 63 +; CHECK-NEXT: v_readlane_b32 s97, v0, 62 +; CHECK-NEXT: v_readlane_b32 s96, v0, 61 +; CHECK-NEXT: v_readlane_b32 s95, v0, 60 +; CHECK-NEXT: v_readlane_b32 s94, v0, 59 +; CHECK-NEXT: v_readlane_b32 s93, v0, 58 +; CHECK-NEXT: v_readlane_b32 s92, v0, 57 +; CHECK-NEXT: v_readlane_b32 s91, v0, 56 +; CHECK-NEXT: 
v_readlane_b32 s90, v0, 55 +; CHECK-NEXT: v_readlane_b32 s89, v0, 54 +; CHECK-NEXT: v_readlane_b32 s88, v0, 53 +; CHECK-NEXT: v_readlane_b32 s87, v0, 52 +; CHECK-NEXT: v_readlane_b32 s86, v0, 51 +; CHECK-NEXT: v_readlane_b32 s85, v0, 50 +; CHECK-NEXT: v_readlane_b32 s84, v0, 49 +; CHECK-NEXT: v_readlane_b32 s83, v0, 48 +; CHECK-NEXT: v_readlane_b32 s82, v0, 47 +; CHECK-NEXT: v_readlane_b32 s81, v0, 46 +; CHECK-NEXT: v_readlane_b32 s80, v0, 45 +; CHECK-NEXT: v_readlane_b32 s79, v0, 44 +; CHECK-NEXT: v_readlane_b32 s78, v0, 43 +; CHECK-NEXT: v_readlane_b32 s77, v0, 42 +; CHECK-NEXT: v_readlane_b32 s76, v0, 41 +; CHECK-NEXT: v_readlane_b32 s75, v0, 40 +; CHECK-NEXT: v_readlane_b32 s74, v0, 39 +; CHECK-NEXT: v_readlane_b32 s73, v0, 38 +; CHECK-NEXT: v_readlane_b32 s72, v0, 37 +; CHECK-NEXT: v_readlane_b32 s71, v0, 36 +; CHECK-NEXT: v_readlane_b32 s70, v0, 35 +; CHECK-NEXT: v_readlane_b32 s69, v0, 34 +; CHECK-NEXT: v_readlane_b32 s68, v0, 33 +; CHECK-NEXT: v_readlane_b32 s67, v0, 32 +; CHECK-NEXT: v_readlane_b32 s66, v0, 31 +; CHECK-NEXT: v_readlane_b32 s65, v0, 30 +; CHECK-NEXT: v_readlane_b32 s64, v0, 29 +; CHECK-NEXT: v_readlane_b32 s63, v0, 28 +; CHECK-NEXT: v_readlane_b32 s62, v0, 27 +; CHECK-NEXT: v_readlane_b32 s61, v0, 26 +; CHECK-NEXT: v_readlane_b32 s60, v0, 25 +; CHECK-NEXT: v_readlane_b32 s59, v0, 24 +; CHECK-NEXT: v_readlane_b32 s58, v0, 23 +; CHECK-NEXT: v_readlane_b32 s57, v0, 22 +; CHECK-NEXT: v_readlane_b32 s56, v0, 21 +; CHECK-NEXT: v_readlane_b32 s55, v0, 20 +; CHECK-NEXT: v_readlane_b32 s54, v0, 19 +; CHECK-NEXT: v_readlane_b32 s53, v0, 18 +; CHECK-NEXT: v_readlane_b32 s52, v0, 17 +; CHECK-NEXT: v_readlane_b32 s51, v0, 16 +; CHECK-NEXT: v_readlane_b32 s50, v0, 15 +; CHECK-NEXT: v_readlane_b32 s49, v0, 14 +; CHECK-NEXT: v_readlane_b32 s48, v0, 13 +; CHECK-NEXT: v_readlane_b32 s47, v0, 12 +; CHECK-NEXT: v_readlane_b32 s46, v0, 11 +; CHECK-NEXT: v_readlane_b32 s45, v0, 10 +; CHECK-NEXT: v_readlane_b32 s44, v0, 9 +; CHECK-NEXT: v_readlane_b32 s43, v0, 8 +; CHECK-NEXT: v_readlane_b32 s42, v0, 7 +; CHECK-NEXT: v_readlane_b32 s41, v0, 6 +; CHECK-NEXT: v_readlane_b32 s40, v0, 5 +; CHECK-NEXT: v_readlane_b32 s39, v0, 4 +; CHECK-NEXT: v_readlane_b32 s38, v0, 3 +; CHECK-NEXT: v_readlane_b32 s37, v0, 2 +; CHECK-NEXT: v_readlane_b32 s36, v0, 1 +; CHECK-NEXT: v_readlane_b32 s35, v0, 0 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "", "~{s35},~{s36},~{s37},~{s38},~{s39},~{s40},~{s41},~{s42} ,~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49},~{s50} @@ -21,13 +166,161 @@ define void @spill_more_than_wavesize_csr_sgprs() { ret void } -; CHECK-LABEL: {{^}}spill_more_than_wavesize_csr_sgprs_with_stack_object: -; CHECK-DAG: v_writelane_b32 v1, s98, 63 -; CHECK-DAG: v_writelane_b32 v2, s99, 0 -; CHECK-NOT: dummy -; CHECK-DAG: v_readlane_b32 s99, v2, 0 -; CHECK-DAG: v_readlane_b32 s98, v1, 63 define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() { +; CHECK-LABEL: spill_more_than_wavesize_csr_sgprs_with_stack_object: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded 
Spill +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: v_writelane_b32 v1, s35, 0 +; CHECK-NEXT: v_writelane_b32 v1, s36, 1 +; CHECK-NEXT: v_writelane_b32 v1, s37, 2 +; CHECK-NEXT: v_writelane_b32 v1, s38, 3 +; CHECK-NEXT: v_writelane_b32 v1, s39, 4 +; CHECK-NEXT: v_writelane_b32 v1, s40, 5 +; CHECK-NEXT: v_writelane_b32 v1, s41, 6 +; CHECK-NEXT: v_writelane_b32 v1, s42, 7 +; CHECK-NEXT: v_writelane_b32 v1, s43, 8 +; CHECK-NEXT: v_writelane_b32 v1, s44, 9 +; CHECK-NEXT: v_writelane_b32 v1, s45, 10 +; CHECK-NEXT: v_writelane_b32 v1, s46, 11 +; CHECK-NEXT: v_writelane_b32 v1, s47, 12 +; CHECK-NEXT: v_writelane_b32 v1, s48, 13 +; CHECK-NEXT: v_writelane_b32 v1, s49, 14 +; CHECK-NEXT: v_writelane_b32 v1, s50, 15 +; CHECK-NEXT: v_writelane_b32 v1, s51, 16 +; CHECK-NEXT: v_writelane_b32 v1, s52, 17 +; CHECK-NEXT: v_writelane_b32 v1, s53, 18 +; CHECK-NEXT: v_writelane_b32 v1, s54, 19 +; CHECK-NEXT: v_writelane_b32 v1, s55, 20 +; CHECK-NEXT: v_writelane_b32 v1, s56, 21 +; CHECK-NEXT: v_writelane_b32 v1, s57, 22 +; CHECK-NEXT: v_writelane_b32 v1, s58, 23 +; CHECK-NEXT: v_writelane_b32 v1, s59, 24 +; CHECK-NEXT: v_writelane_b32 v1, s60, 25 +; CHECK-NEXT: v_writelane_b32 v1, s61, 26 +; CHECK-NEXT: v_writelane_b32 v1, s62, 27 +; CHECK-NEXT: v_writelane_b32 v1, s63, 28 +; CHECK-NEXT: v_writelane_b32 v1, s64, 29 +; CHECK-NEXT: v_writelane_b32 v1, s65, 30 +; CHECK-NEXT: v_writelane_b32 v1, s66, 31 +; CHECK-NEXT: v_writelane_b32 v1, s67, 32 +; CHECK-NEXT: v_writelane_b32 v1, s68, 33 +; CHECK-NEXT: v_writelane_b32 v1, s69, 34 +; CHECK-NEXT: v_writelane_b32 v1, s70, 35 +; CHECK-NEXT: v_writelane_b32 v1, s71, 36 +; CHECK-NEXT: v_writelane_b32 v1, s72, 37 +; CHECK-NEXT: v_writelane_b32 v1, s73, 38 +; CHECK-NEXT: v_writelane_b32 v1, s74, 39 +; CHECK-NEXT: v_writelane_b32 v1, s75, 40 +; CHECK-NEXT: v_writelane_b32 v1, s76, 41 +; CHECK-NEXT: v_writelane_b32 v1, s77, 42 +; CHECK-NEXT: v_writelane_b32 v1, s78, 43 +; CHECK-NEXT: v_writelane_b32 v1, s79, 44 +; CHECK-NEXT: v_writelane_b32 v1, s80, 45 +; CHECK-NEXT: v_writelane_b32 v1, s81, 46 +; CHECK-NEXT: v_writelane_b32 v1, s82, 47 +; CHECK-NEXT: v_writelane_b32 v1, s83, 48 +; CHECK-NEXT: v_writelane_b32 v1, s84, 49 +; CHECK-NEXT: v_writelane_b32 v1, s85, 50 +; CHECK-NEXT: v_writelane_b32 v1, s86, 51 +; CHECK-NEXT: v_writelane_b32 v1, s87, 52 +; CHECK-NEXT: v_writelane_b32 v1, s88, 53 +; CHECK-NEXT: v_writelane_b32 v1, s89, 54 +; CHECK-NEXT: v_writelane_b32 v1, s90, 55 +; CHECK-NEXT: v_writelane_b32 v1, s91, 56 +; CHECK-NEXT: v_writelane_b32 v1, s92, 57 +; CHECK-NEXT: v_writelane_b32 v1, s93, 58 +; CHECK-NEXT: v_writelane_b32 v1, s94, 59 +; CHECK-NEXT: v_writelane_b32 v1, s95, 60 +; CHECK-NEXT: v_writelane_b32 v2, s99, 0 +; CHECK-NEXT: v_writelane_b32 v1, s96, 61 +; CHECK-NEXT: v_writelane_b32 v2, s100, 1 +; CHECK-NEXT: v_writelane_b32 v1, s97, 62 +; CHECK-NEXT: v_writelane_b32 v2, s101, 2 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_writelane_b32 v1, s98, 63 +; CHECK-NEXT: v_writelane_b32 v2, s102, 3 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s102, v2, 3 +; CHECK-NEXT: v_readlane_b32 s101, v2, 2 +; CHECK-NEXT: v_readlane_b32 s100, v2, 1 +; CHECK-NEXT: v_readlane_b32 s99, v2, 0 +; CHECK-NEXT: v_readlane_b32 s98, v1, 63 +; CHECK-NEXT: v_readlane_b32 s97, v1, 62 +; CHECK-NEXT: v_readlane_b32 s96, v1, 61 +; CHECK-NEXT: v_readlane_b32 s95, v1, 60 +; CHECK-NEXT: v_readlane_b32 s94, v1, 59 +; CHECK-NEXT: v_readlane_b32 s93, v1, 58 +; 
CHECK-NEXT: v_readlane_b32 s92, v1, 57 +; CHECK-NEXT: v_readlane_b32 s91, v1, 56 +; CHECK-NEXT: v_readlane_b32 s90, v1, 55 +; CHECK-NEXT: v_readlane_b32 s89, v1, 54 +; CHECK-NEXT: v_readlane_b32 s88, v1, 53 +; CHECK-NEXT: v_readlane_b32 s87, v1, 52 +; CHECK-NEXT: v_readlane_b32 s86, v1, 51 +; CHECK-NEXT: v_readlane_b32 s85, v1, 50 +; CHECK-NEXT: v_readlane_b32 s84, v1, 49 +; CHECK-NEXT: v_readlane_b32 s83, v1, 48 +; CHECK-NEXT: v_readlane_b32 s82, v1, 47 +; CHECK-NEXT: v_readlane_b32 s81, v1, 46 +; CHECK-NEXT: v_readlane_b32 s80, v1, 45 +; CHECK-NEXT: v_readlane_b32 s79, v1, 44 +; CHECK-NEXT: v_readlane_b32 s78, v1, 43 +; CHECK-NEXT: v_readlane_b32 s77, v1, 42 +; CHECK-NEXT: v_readlane_b32 s76, v1, 41 +; CHECK-NEXT: v_readlane_b32 s75, v1, 40 +; CHECK-NEXT: v_readlane_b32 s74, v1, 39 +; CHECK-NEXT: v_readlane_b32 s73, v1, 38 +; CHECK-NEXT: v_readlane_b32 s72, v1, 37 +; CHECK-NEXT: v_readlane_b32 s71, v1, 36 +; CHECK-NEXT: v_readlane_b32 s70, v1, 35 +; CHECK-NEXT: v_readlane_b32 s69, v1, 34 +; CHECK-NEXT: v_readlane_b32 s68, v1, 33 +; CHECK-NEXT: v_readlane_b32 s67, v1, 32 +; CHECK-NEXT: v_readlane_b32 s66, v1, 31 +; CHECK-NEXT: v_readlane_b32 s65, v1, 30 +; CHECK-NEXT: v_readlane_b32 s64, v1, 29 +; CHECK-NEXT: v_readlane_b32 s63, v1, 28 +; CHECK-NEXT: v_readlane_b32 s62, v1, 27 +; CHECK-NEXT: v_readlane_b32 s61, v1, 26 +; CHECK-NEXT: v_readlane_b32 s60, v1, 25 +; CHECK-NEXT: v_readlane_b32 s59, v1, 24 +; CHECK-NEXT: v_readlane_b32 s58, v1, 23 +; CHECK-NEXT: v_readlane_b32 s57, v1, 22 +; CHECK-NEXT: v_readlane_b32 s56, v1, 21 +; CHECK-NEXT: v_readlane_b32 s55, v1, 20 +; CHECK-NEXT: v_readlane_b32 s54, v1, 19 +; CHECK-NEXT: v_readlane_b32 s53, v1, 18 +; CHECK-NEXT: v_readlane_b32 s52, v1, 17 +; CHECK-NEXT: v_readlane_b32 s51, v1, 16 +; CHECK-NEXT: v_readlane_b32 s50, v1, 15 +; CHECK-NEXT: v_readlane_b32 s49, v1, 14 +; CHECK-NEXT: v_readlane_b32 s48, v1, 13 +; CHECK-NEXT: v_readlane_b32 s47, v1, 12 +; CHECK-NEXT: v_readlane_b32 s46, v1, 11 +; CHECK-NEXT: v_readlane_b32 s45, v1, 10 +; CHECK-NEXT: v_readlane_b32 s44, v1, 9 +; CHECK-NEXT: v_readlane_b32 s43, v1, 8 +; CHECK-NEXT: v_readlane_b32 s42, v1, 7 +; CHECK-NEXT: v_readlane_b32 s41, v1, 6 +; CHECK-NEXT: v_readlane_b32 s40, v1, 5 +; CHECK-NEXT: v_readlane_b32 s39, v1, 4 +; CHECK-NEXT: v_readlane_b32 s38, v1, 3 +; CHECK-NEXT: v_readlane_b32 s37, v1, 2 +; CHECK-NEXT: v_readlane_b32 s36, v1, 1 +; CHECK-NEXT: v_readlane_b32 s35, v1, 0 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca call void asm sideeffect "", diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index fed60eecc8a8b..0e568e3071e99 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -8,92 +8,168 @@ ; 4 byte emergency stack slot ; = 144 bytes with padding between them -; GCN-LABEL: {{^}}needs_align16_default_stack_align: -; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, v0 -; GCN-DAG: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, s32 -; GCN: v_add_u32_e32 [[FI:v[0-9]+]], vcc, [[SCALED_IDX]], [[FRAMEDIFF]] - -; GCN-NOT: s32 - -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: v_or_b32_e32 
v{{[0-9]+}}, 12 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen - -; GCN-NOT: s32 - -; GCN: ; ScratchSize: 144 define void @needs_align16_default_stack_align(i32 %idx) #0 { +; GCN-LABEL: needs_align16_default_stack_align: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s32 +; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: v_mov_b32_e32 v1, 4 +; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, 12, v0 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, 8, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 3 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 2 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN: ; ScratchSize: 144 %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5) %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx store volatile <4 x i32> , ptr addrspace(5) %gep0, align 16 ret void } -; GCN-LABEL: {{^}}needs_align16_stack_align4: -; GCN: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0x3c0{{$}} -; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xfffffc00 - -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_addk_i32 s32, 0x2800{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen - -; GCN: s_mov_b32 s32, s34 - -; GCN: ; ScratchSize: 160 define void @needs_align16_stack_align4(i32 %idx) #2 { +; GCN-LABEL: needs_align16_stack_align4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x3c0 +; GCN-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 +; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: v_mov_b32_e32 v1, 4 +; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, 12, v0 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, 8, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 3 +; GCN-NEXT: s_mov_b32 s5, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_addk_i32 s32, 0x2800 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 2 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s5 +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN: ; ScratchSize: 160 %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5) %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx store volatile <4 x i32> , ptr addrspace(5) %gep0, align 16 ret void } -; GCN-LABEL: {{^}}needs_align32: 
-; GCN: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0x7c0{{$}} -; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xfffff800 - -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_addk_i32 s32, 0x3000{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_mov_b32 s32, s34 - -; GCN: ; ScratchSize: 192 define void @needs_align32(i32 %idx) #0 { +; GCN-LABEL: needs_align32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x7c0 +; GCN-NEXT: s_and_b32 s33, s33, 0xfffff800 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 +; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: v_mov_b32_e32 v1, 4 +; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, 12, v0 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, 8, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 3 +; GCN-NEXT: s_mov_b32 s5, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_addk_i32 s32, 0x3000 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 2 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s5 +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN: ; ScratchSize: 192 %alloca.align16 = alloca [8 x <4 x i32>], align 32, addrspace(5) %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx store volatile <4 x i32> , ptr addrspace(5) %gep0, align 32 ret void } -; GCN-LABEL: {{^}}force_realign4: -; GCN: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0xc0{{$}} -; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffffff00 -; GCN: s_addk_i32 s32, 0xd00{{$}} - -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_mov_b32 s32, s34 - -; GCN: ; ScratchSize: 52 define void @force_realign4(i32 %idx) #1 { +; GCN-LABEL: force_realign4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0xc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffff00 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33 +; GCN-NEXT: s_mov_b32 s5, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_addk_i32 s32, 0xd00 +; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v1, 3 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s5 +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN: ; ScratchSize: 52 %alloca.align16 = alloca [8 x i32], align 4, addrspace(5) %gep0 = getelementptr inbounds [8 x i32], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx store volatile i32 3, ptr addrspace(5) %gep0, align 4 ret void } -; GCN-LABEL: {{^}}kernel_call_align16_from_8: -; GCN: s_movk_i32 s32, 0x400{{$}} -; GCN-NOT: s32 -; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_align16_from_8() #0 { +; GCN-LABEL: kernel_call_align16_from_8: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 
flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_getpc_b64 s[14:15] +; GCN-NEXT: s_add_u32 s14, s14, needs_align16_default_stack_align@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s15, s15, needs_align16_default_stack_align@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v3, 2 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_movk_i32 s32, 0x400 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: s_endpgm %alloca = alloca i32, align 4, addrspace(5) store volatile i32 2, ptr addrspace(5) %alloca call void @needs_align16_default_stack_align(i32 1) @@ -101,10 +177,32 @@ define amdgpu_kernel void @kernel_call_align16_from_8() #0 { } ; The call sequence should keep the stack on call aligned to 4 -; GCN-LABEL: {{^}}kernel_call_align16_from_5: -; GCN: s_movk_i32 s32, 0x400 -; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_align16_from_5() { +; GCN-LABEL: kernel_call_align16_from_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_getpc_b64 s[14:15] +; GCN-NEXT: s_add_u32 s14, s14, needs_align16_default_stack_align@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s15, s15, needs_align16_default_stack_align@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v3, 2 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_movk_i32 s32, 0x400 +; GCN-NEXT: buffer_store_byte v3, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: s_endpgm %alloca0 = alloca i8, align 1, addrspace(5) store volatile i8 2, ptr addrspace(5) %alloca0 @@ -112,10 +210,32 @@ define amdgpu_kernel void @kernel_call_align16_from_5() { ret void } -; GCN-LABEL: {{^}}kernel_call_align4_from_5: -; GCN: s_movk_i32 s32, 0x400 -; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_align4_from_5() { +; GCN-LABEL: kernel_call_align4_from_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_getpc_b64 s[14:15] +; GCN-NEXT: s_add_u32 s14, s14, needs_align16_stack_align4@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s15, s15, needs_align16_stack_align4@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v3, 2 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: 
s_movk_i32 s32, 0x400 +; GCN-NEXT: buffer_store_byte v3, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: s_endpgm %alloca0 = alloca i8, align 1, addrspace(5) store volatile i8 2, ptr addrspace(5) %alloca0 @@ -123,28 +243,36 @@ define amdgpu_kernel void @kernel_call_align4_from_5() { ret void } -; GCN-LABEL: {{^}}default_realign_align128: -; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 -; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 -; GCN-NEXT: s_mov_b32 s5, s34 -; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: s_addk_i32 s32, 0x4000 -; GCN-NOT: s33 -; GCN: buffer_store_dword v0, off, s[0:3], s33{{$}} -; GCN: s_mov_b32 s32, s34 -; GCN: s_mov_b32 s33, [[FP_COPY]] define void @default_realign_align128(i32 %idx) #0 { +; GCN-LABEL: default_realign_align128: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_mov_b32 s5, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_addk_i32 s32, 0x4000 +; GCN-NEXT: v_mov_b32_e32 v0, 9 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s5 +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] %alloca.align = alloca i32, align 128, addrspace(5) store volatile i32 9, ptr addrspace(5) %alloca.align, align 128 ret void } -; GCN-LABEL: {{^}}disable_realign_align128: -; GCN-NOT: s32 -; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} -; GCN-NOT: s32 define void @disable_realign_align128(i32 %idx) #3 { +; GCN-LABEL: disable_realign_align128: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, 9 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %alloca.align = alloca i32, align 128, addrspace(5) store volatile i32 9, ptr addrspace(5) %alloca.align, align 128 ret void @@ -156,35 +284,48 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 { ; since there is a local object with an alignment of 1024. ; Should use BP to access the incoming stack arguments. ; The BP value is saved/restored with a VGPR spill. 
- ; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill: -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0 -; GCN-NEXT: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 -; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GCN-NEXT: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], [[FP_SCRATCH_COPY]], 2 -; GCN-NEXT: v_mov_b32_e32 v32, 0 -; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3 -; GCN: s_mov_b32 s34, s32 -; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 -; GCN-DAG: s_add_i32 s32, s32, 0x30000 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 -; GCN: s_swappc_b64 s[30:31], +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s16, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0xffc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_writelane_b32 v40, s16, 2 +; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: v_writelane_b32 v40, s34, 3 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:1024 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s34 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s34 offset:4 +; GCN-NEXT: s_add_i32 s32, s32, 0x30000 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: v_readlane_b32 s34, v40, 3 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1 -; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0 -; GCN-NEXT: s_mov_b32 s32, s34 -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG]], 2 -; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; GCN: s_setpc_b64 s[30:31] %temp = alloca i32, align 1024, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 1024 call void @extern_func(<32 x i32> %a, i32 %b) @@ -198,23 +339,56 @@ define i32 @needs_align1024_stack_args_used_inside_loop(ptr addrspace(5) nocaptu ; index variable, the base pointer first get loaded into a VGPR ; and that value should be further referenced to load the incoming values. 
; The BP value will get saved/restored in an SGPR at the prolgoue/epilogue. - ; GCN-LABEL: needs_align1024_stack_args_used_inside_loop: -; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_add_i32 s33, s32, 0xffc0 -; GCN-NEXT: s_mov_b32 [[BP_COPY:s[0-9]+]], s34 -; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000 -; GCN-NEXT: v_lshrrev_b32_e64 [[VGPR_REG:v[0-9]+]], 6, s34 -; GCN-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0 -; GCN: s_add_i32 s32, s32, 0x30000 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:1024 -; GCN: buffer_load_dword v{{[0-9]+}}, [[VGPR_REG]], s[0:3], 0 offen -; GCN: v_add_u32_e32 [[VGPR_REG]], vcc, 4, [[VGPR_REG]] -; GCN: s_mov_b32 s32, s34 -; GCN-NEXT: s_mov_b32 s34, [[BP_COPY]] -; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN: ; %bb.0: ; %begin +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s11, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0xffc0 +; GCN-NEXT: s_mov_b32 s14, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000 +; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s34 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: s_add_i32 s32, s32, 0x30000 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1024 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 +; GCN-NEXT: s_branch .LBB10_2 +; GCN-NEXT: .LBB10_1: ; %Flow +; GCN-NEXT: ; in Loop: Header=BB10_2 Depth=1 +; GCN-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-NEXT: s_and_b64 s[8:9], exec, s[6:7] +; GCN-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB10_4 +; GCN-NEXT: .LBB10_2: ; %loop_body +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s10, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN-NEXT: s_cbranch_execz .LBB10_1 +; GCN-NEXT: ; %bb.3: ; %loop_end +; GCN-NEXT: ; in Loop: Header=BB10_2 Depth=1 +; GCN-NEXT: s_add_i32 s10, s10, 1 +; GCN-NEXT: s_cmp_eq_u32 s10, 9 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_and_b64 s[12:13], s[12:13], exec +; GCN-NEXT: v_add_u32_e32 v1, vcc, 4, v1 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] +; GCN-NEXT: s_branch .LBB10_1 +; GCN-NEXT: .LBB10_4: ; %exit +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s14 +; GCN-NEXT: s_mov_b32 s33, s11 +; GCN-NEXT: s_setpc_b64 s[30:31] begin: %local_var = alloca i32, align 1024, addrspace(5) store volatile i32 0, ptr addrspace(5) %local_var, align 1024 @@ -239,16 +413,31 @@ exit: ; preds = %loop_end, %loop_b define void @no_free_scratch_sgpr_for_bp_copy(<32 x i32> %a, i32 %b) #0 { ; GCN-LABEL: no_free_scratch_sgpr_for_bp_copy: -; GCN: ; %bb.0: -; GCN: v_writelane_b32 [[VGPR_REG:v[0-9]+]], s34, 0 -; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 -; GCN: v_readlane_b32 s34, [[VGPR_REG:v[0-9]+]], 0 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ;;#ASMEND -; GCN: s_setpc_b64 s[30:31] +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 vcc_lo, s33 +; GCN-NEXT: s_add_i32 s33, s32, 
0x1fc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 v1, s34, 0 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4 +; GCN-NEXT: s_addk_i32 s32, 0x6000 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: v_readlane_b32 s34, v1, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b32 s33, vcc_lo +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, ptr addrspace(5) %local_val, align 128 ; Use all clobberable registers, so BP has to spill to a VGPR. @@ -262,15 +451,172 @@ define void @no_free_scratch_sgpr_for_bp_copy(<32 x i32> %a, i32 %b) #0 { define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 { ; If there are no free SGPRs or VGPRs available we must spill the BP to memory. - -; GCN-LABEL: no_free_regs_spill_bp_to_mem -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN: s_xor_saveexec_b64 s[6:7], -1 -; GCN: buffer_store_dword v39, off, s[0:3], s33 -; GCN: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]] -; GCN: buffer_store_dword v0, off, s[0:3], s33 -; GCN: v_mov_b32_e32 v0, s34 -; GCN-DAG: buffer_store_dword v0, off, s[0:3], s33 +; GCN-LABEL: no_free_regs_spill_bp_to_memory: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4 +; GCN-NEXT: v_writelane_b32 v39, s39, 0 +; GCN-NEXT: v_writelane_b32 v39, s40, 1 +; GCN-NEXT: v_writelane_b32 v39, s41, 2 +; GCN-NEXT: v_writelane_b32 v39, s42, 3 +; GCN-NEXT: v_writelane_b32 v39, s43, 4 +; GCN-NEXT: v_writelane_b32 v39, s44, 5 +; GCN-NEXT: v_writelane_b32 v39, s45, 6 +; GCN-NEXT: v_writelane_b32 v39, s46, 7 +; GCN-NEXT: v_writelane_b32 v39, s47, 8 +; GCN-NEXT: v_writelane_b32 v39, s48, 9 +; GCN-NEXT: v_writelane_b32 v39, s49, 10 +; GCN-NEXT: v_writelane_b32 v39, s50, 11 +; GCN-NEXT: v_writelane_b32 v39, s51, 12 +; GCN-NEXT: v_writelane_b32 v39, s52, 13 +; GCN-NEXT: v_writelane_b32 v39, s53, 14 +; GCN-NEXT: v_writelane_b32 v39, s54, 15 +; GCN-NEXT: v_writelane_b32 v39, s55, 16 +; GCN-NEXT: v_writelane_b32 v39, s56, 17 +; GCN-NEXT: v_writelane_b32 v39, s57, 18 +; GCN-NEXT: v_writelane_b32 v39, s58, 19 +; GCN-NEXT: v_writelane_b32 v39, s59, 20 +; GCN-NEXT: v_writelane_b32 v39, s60, 21 +; GCN-NEXT: v_writelane_b32 v39, s61, 22 +; GCN-NEXT: v_writelane_b32 v39, s62, 23 +; GCN-NEXT: v_writelane_b32 v39, s63, 24 +; GCN-NEXT: v_writelane_b32 v39, s64, 25 +; GCN-NEXT: v_writelane_b32 v39, s65, 26 +; GCN-NEXT: 
v_writelane_b32 v39, s66, 27 +; GCN-NEXT: v_writelane_b32 v39, s67, 28 +; GCN-NEXT: v_writelane_b32 v39, s68, 29 +; GCN-NEXT: v_writelane_b32 v39, s69, 30 +; GCN-NEXT: v_writelane_b32 v39, s70, 31 +; GCN-NEXT: v_writelane_b32 v39, s71, 32 +; GCN-NEXT: v_writelane_b32 v39, s72, 33 +; GCN-NEXT: v_writelane_b32 v39, s73, 34 +; GCN-NEXT: v_writelane_b32 v39, s74, 35 +; GCN-NEXT: v_writelane_b32 v39, s75, 36 +; GCN-NEXT: v_writelane_b32 v39, s76, 37 +; GCN-NEXT: v_writelane_b32 v39, s77, 38 +; GCN-NEXT: v_writelane_b32 v39, s78, 39 +; GCN-NEXT: v_writelane_b32 v39, s79, 40 +; GCN-NEXT: v_writelane_b32 v39, s80, 41 +; GCN-NEXT: v_writelane_b32 v39, s81, 42 +; GCN-NEXT: v_writelane_b32 v39, s82, 43 +; GCN-NEXT: v_writelane_b32 v39, s83, 44 +; GCN-NEXT: v_writelane_b32 v39, s84, 45 +; GCN-NEXT: v_writelane_b32 v39, s85, 46 +; GCN-NEXT: v_writelane_b32 v39, s86, 47 +; GCN-NEXT: v_writelane_b32 v39, s87, 48 +; GCN-NEXT: v_writelane_b32 v39, s88, 49 +; GCN-NEXT: v_writelane_b32 v39, s89, 50 +; GCN-NEXT: v_writelane_b32 v39, s90, 51 +; GCN-NEXT: v_writelane_b32 v39, s91, 52 +; GCN-NEXT: v_writelane_b32 v39, s92, 53 +; GCN-NEXT: v_writelane_b32 v39, s93, 54 +; GCN-NEXT: v_writelane_b32 v39, s94, 55 +; GCN-NEXT: v_writelane_b32 v39, s95, 56 +; GCN-NEXT: v_writelane_b32 v39, s96, 57 +; GCN-NEXT: v_writelane_b32 v39, s97, 58 +; GCN-NEXT: v_writelane_b32 v39, s98, 59 +; GCN-NEXT: v_writelane_b32 v39, s99, 60 +; GCN-NEXT: v_writelane_b32 v39, s100, 61 +; GCN-NEXT: v_writelane_b32 v39, s101, 62 +; GCN-NEXT: v_writelane_b32 v39, s102, 63 +; GCN-NEXT: s_addk_i32 s32, 0x6000 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; clobber all VGPRs +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s102, v39, 63 +; GCN-NEXT: v_readlane_b32 s101, v39, 62 +; GCN-NEXT: v_readlane_b32 s100, v39, 61 +; GCN-NEXT: v_readlane_b32 s99, v39, 60 +; GCN-NEXT: v_readlane_b32 s98, v39, 59 +; GCN-NEXT: v_readlane_b32 s97, v39, 58 +; GCN-NEXT: v_readlane_b32 s96, v39, 57 +; GCN-NEXT: v_readlane_b32 s95, v39, 56 +; GCN-NEXT: v_readlane_b32 s94, v39, 55 +; GCN-NEXT: v_readlane_b32 s93, v39, 54 +; GCN-NEXT: v_readlane_b32 s92, v39, 53 +; GCN-NEXT: v_readlane_b32 s91, v39, 52 +; GCN-NEXT: v_readlane_b32 s90, v39, 51 +; GCN-NEXT: v_readlane_b32 s89, v39, 50 +; GCN-NEXT: v_readlane_b32 s88, v39, 49 +; GCN-NEXT: v_readlane_b32 s87, v39, 48 +; GCN-NEXT: v_readlane_b32 s86, v39, 47 +; GCN-NEXT: v_readlane_b32 s85, v39, 46 +; GCN-NEXT: v_readlane_b32 s84, v39, 45 +; GCN-NEXT: v_readlane_b32 s83, v39, 44 +; GCN-NEXT: v_readlane_b32 s82, v39, 43 +; GCN-NEXT: v_readlane_b32 s81, v39, 42 +; GCN-NEXT: v_readlane_b32 s80, v39, 41 +; GCN-NEXT: v_readlane_b32 s79, v39, 40 +; GCN-NEXT: v_readlane_b32 s78, v39, 39 +; GCN-NEXT: v_readlane_b32 s77, v39, 38 +; GCN-NEXT: v_readlane_b32 s76, v39, 37 +; GCN-NEXT: v_readlane_b32 s75, v39, 36 +; GCN-NEXT: v_readlane_b32 s74, v39, 35 +; GCN-NEXT: v_readlane_b32 s73, v39, 34 +; GCN-NEXT: v_readlane_b32 s72, v39, 33 +; GCN-NEXT: v_readlane_b32 s71, v39, 32 +; GCN-NEXT: v_readlane_b32 s70, v39, 31 +; GCN-NEXT: v_readlane_b32 s69, v39, 30 +; GCN-NEXT: v_readlane_b32 s68, v39, 29 +; GCN-NEXT: v_readlane_b32 s67, v39, 28 +; GCN-NEXT: v_readlane_b32 s66, v39, 27 +; GCN-NEXT: 
v_readlane_b32 s65, v39, 26 +; GCN-NEXT: v_readlane_b32 s64, v39, 25 +; GCN-NEXT: v_readlane_b32 s63, v39, 24 +; GCN-NEXT: v_readlane_b32 s62, v39, 23 +; GCN-NEXT: v_readlane_b32 s61, v39, 22 +; GCN-NEXT: v_readlane_b32 s60, v39, 21 +; GCN-NEXT: v_readlane_b32 s59, v39, 20 +; GCN-NEXT: v_readlane_b32 s58, v39, 19 +; GCN-NEXT: v_readlane_b32 s57, v39, 18 +; GCN-NEXT: v_readlane_b32 s56, v39, 17 +; GCN-NEXT: v_readlane_b32 s55, v39, 16 +; GCN-NEXT: v_readlane_b32 s54, v39, 15 +; GCN-NEXT: v_readlane_b32 s53, v39, 14 +; GCN-NEXT: v_readlane_b32 s52, v39, 13 +; GCN-NEXT: v_readlane_b32 s51, v39, 12 +; GCN-NEXT: v_readlane_b32 s50, v39, 11 +; GCN-NEXT: v_readlane_b32 s49, v39, 10 +; GCN-NEXT: v_readlane_b32 s48, v39, 9 +; GCN-NEXT: v_readlane_b32 s47, v39, 8 +; GCN-NEXT: v_readlane_b32 s46, v39, 7 +; GCN-NEXT: v_readlane_b32 s45, v39, 6 +; GCN-NEXT: v_readlane_b32 s44, v39, 5 +; GCN-NEXT: v_readlane_b32 s43, v39, 4 +; GCN-NEXT: v_readlane_b32 s42, v39, 3 +; GCN-NEXT: v_readlane_b32 s41, v39, 2 +; GCN-NEXT: v_readlane_b32 s40, v39, 1 +; GCN-NEXT: v_readlane_b32 s39, v39, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s34, v0 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, ptr addrspace(5) %local_val, align 128 @@ -297,22 +643,179 @@ define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 { define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i32 %b, ptr addrspace(5) byval([4096 x i8]) align 4 %arg) #5 { ; If the size of the offset exceeds the MUBUF offset field we need another ; scratch VGPR to hold the offset. 
- -; GCN-LABEL: spill_bp_to_memory_scratch_reg_needed_mubuf_offset -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 -; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 -; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GCN-NEXT: s_add_i32 s5, s33, 0x42100 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]] -; GCN-NEXT: s_add_i32 s5, s33, 0x42200 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NEXT: s_add_i32 s5, s33, 0x42300 -; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-LABEL: spill_bp_to_memory_scratch_reg_needed_mubuf_offset: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_add_i32 s5, s33, 0x42100 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_add_i32 s5, s33, 0x42200 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NEXT: s_add_i32 s5, s33, 0x42300 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4 +; GCN-NEXT: v_writelane_b32 v39, s39, 0 +; GCN-NEXT: v_writelane_b32 v39, s40, 1 +; GCN-NEXT: v_writelane_b32 v39, s41, 2 +; GCN-NEXT: v_writelane_b32 v39, s42, 3 +; GCN-NEXT: v_writelane_b32 v39, s43, 4 +; GCN-NEXT: v_writelane_b32 v39, s44, 5 +; GCN-NEXT: v_writelane_b32 v39, s45, 6 +; GCN-NEXT: v_writelane_b32 v39, s46, 7 +; GCN-NEXT: v_writelane_b32 v39, s47, 8 +; GCN-NEXT: v_writelane_b32 v39, s48, 9 +; GCN-NEXT: v_writelane_b32 v39, s49, 10 +; GCN-NEXT: v_writelane_b32 v39, s50, 11 +; GCN-NEXT: v_writelane_b32 v39, s51, 12 +; GCN-NEXT: v_writelane_b32 v39, s52, 13 +; GCN-NEXT: v_writelane_b32 v39, s53, 14 +; GCN-NEXT: v_writelane_b32 v39, s54, 15 +; GCN-NEXT: v_writelane_b32 v39, s55, 16 +; GCN-NEXT: v_writelane_b32 v39, s56, 17 +; GCN-NEXT: v_writelane_b32 v39, s57, 18 +; GCN-NEXT: v_writelane_b32 v39, s58, 19 +; GCN-NEXT: v_writelane_b32 v39, s59, 20 +; GCN-NEXT: v_writelane_b32 v39, s60, 21 +; GCN-NEXT: v_writelane_b32 v39, s61, 22 +; GCN-NEXT: v_writelane_b32 v39, s62, 23 +; GCN-NEXT: v_writelane_b32 v39, s63, 24 +; GCN-NEXT: v_writelane_b32 v39, s64, 25 +; GCN-NEXT: v_writelane_b32 v39, s65, 26 +; GCN-NEXT: v_writelane_b32 v39, s66, 27 +; GCN-NEXT: v_writelane_b32 v39, s67, 28 +; GCN-NEXT: v_writelane_b32 v39, s68, 29 +; GCN-NEXT: v_writelane_b32 v39, s69, 30 +; GCN-NEXT: v_writelane_b32 v39, s70, 31 +; GCN-NEXT: v_writelane_b32 v39, s71, 32 +; GCN-NEXT: v_writelane_b32 v39, s72, 33 +; GCN-NEXT: v_writelane_b32 v39, s73, 34 +; GCN-NEXT: v_writelane_b32 v39, s74, 35 +; GCN-NEXT: v_writelane_b32 v39, s75, 36 +; GCN-NEXT: v_writelane_b32 v39, s76, 37 +; GCN-NEXT: v_writelane_b32 v39, s77, 38 +; GCN-NEXT: v_writelane_b32 v39, s78, 39 +; GCN-NEXT: v_writelane_b32 v39, s79, 40 +; GCN-NEXT: v_writelane_b32 v39, s80, 41 +; GCN-NEXT: v_writelane_b32 v39, s81, 42 +; GCN-NEXT: v_writelane_b32 v39, s82, 43 +; GCN-NEXT: v_writelane_b32 v39, s83, 44 +; GCN-NEXT: v_writelane_b32 v39, s84, 45 +; GCN-NEXT: v_writelane_b32 v39, s85, 46 +; GCN-NEXT: 
v_writelane_b32 v39, s86, 47 +; GCN-NEXT: v_writelane_b32 v39, s87, 48 +; GCN-NEXT: v_writelane_b32 v39, s88, 49 +; GCN-NEXT: v_writelane_b32 v39, s89, 50 +; GCN-NEXT: v_writelane_b32 v39, s90, 51 +; GCN-NEXT: v_writelane_b32 v39, s91, 52 +; GCN-NEXT: v_writelane_b32 v39, s92, 53 +; GCN-NEXT: v_writelane_b32 v39, s93, 54 +; GCN-NEXT: v_writelane_b32 v39, s94, 55 +; GCN-NEXT: v_writelane_b32 v39, s95, 56 +; GCN-NEXT: v_writelane_b32 v39, s96, 57 +; GCN-NEXT: v_writelane_b32 v39, s97, 58 +; GCN-NEXT: v_writelane_b32 v39, s98, 59 +; GCN-NEXT: v_writelane_b32 v39, s99, 60 +; GCN-NEXT: v_writelane_b32 v39, s100, 61 +; GCN-NEXT: v_writelane_b32 v39, s101, 62 +; GCN-NEXT: v_mov_b32_e32 v1, 0x1080 +; GCN-NEXT: v_writelane_b32 v39, s102, 63 +; GCN-NEXT: s_add_i32 s32, s32, 0x46000 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; clobber all VGPRs +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_add_i32 s5, s33, 0x42200 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload +; GCN-NEXT: s_add_i32 s5, s33, 0x42300 +; GCN-NEXT: v_readlane_b32 s102, v39, 63 +; GCN-NEXT: v_readlane_b32 s101, v39, 62 +; GCN-NEXT: v_readlane_b32 s100, v39, 61 +; GCN-NEXT: v_readlane_b32 s99, v39, 60 +; GCN-NEXT: v_readlane_b32 s98, v39, 59 +; GCN-NEXT: v_readlane_b32 s97, v39, 58 +; GCN-NEXT: v_readlane_b32 s96, v39, 57 +; GCN-NEXT: v_readlane_b32 s95, v39, 56 +; GCN-NEXT: v_readlane_b32 s94, v39, 55 +; GCN-NEXT: v_readlane_b32 s93, v39, 54 +; GCN-NEXT: v_readlane_b32 s92, v39, 53 +; GCN-NEXT: v_readlane_b32 s91, v39, 52 +; GCN-NEXT: v_readlane_b32 s90, v39, 51 +; GCN-NEXT: v_readlane_b32 s89, v39, 50 +; GCN-NEXT: v_readlane_b32 s88, v39, 49 +; GCN-NEXT: v_readlane_b32 s87, v39, 48 +; GCN-NEXT: v_readlane_b32 s86, v39, 47 +; GCN-NEXT: v_readlane_b32 s85, v39, 46 +; GCN-NEXT: v_readlane_b32 s84, v39, 45 +; GCN-NEXT: v_readlane_b32 s83, v39, 44 +; GCN-NEXT: v_readlane_b32 s82, v39, 43 +; GCN-NEXT: v_readlane_b32 s81, v39, 42 +; GCN-NEXT: v_readlane_b32 s80, v39, 41 +; GCN-NEXT: v_readlane_b32 s79, v39, 40 +; GCN-NEXT: v_readlane_b32 s78, v39, 39 +; GCN-NEXT: v_readlane_b32 s77, v39, 38 +; GCN-NEXT: v_readlane_b32 s76, v39, 37 +; GCN-NEXT: v_readlane_b32 s75, v39, 36 +; GCN-NEXT: v_readlane_b32 s74, v39, 35 +; GCN-NEXT: v_readlane_b32 s73, v39, 34 +; GCN-NEXT: v_readlane_b32 s72, v39, 33 +; GCN-NEXT: v_readlane_b32 s71, v39, 32 +; GCN-NEXT: v_readlane_b32 s70, v39, 31 +; GCN-NEXT: v_readlane_b32 s69, v39, 30 +; GCN-NEXT: v_readlane_b32 s68, v39, 29 +; GCN-NEXT: v_readlane_b32 s67, v39, 28 +; GCN-NEXT: v_readlane_b32 s66, v39, 27 +; GCN-NEXT: v_readlane_b32 s65, v39, 26 +; GCN-NEXT: v_readlane_b32 s64, v39, 25 +; GCN-NEXT: v_readlane_b32 s63, v39, 24 +; GCN-NEXT: v_readlane_b32 s62, v39, 23 +; GCN-NEXT: v_readlane_b32 s61, v39, 22 +; GCN-NEXT: v_readlane_b32 s60, v39, 21 +; GCN-NEXT: v_readlane_b32 s59, v39, 20 +; GCN-NEXT: v_readlane_b32 s58, v39, 19 +; GCN-NEXT: v_readlane_b32 s57, v39, 18 +; GCN-NEXT: v_readlane_b32 s56, v39, 17 +; GCN-NEXT: v_readlane_b32 s55, v39, 16 +; GCN-NEXT: v_readlane_b32 s54, v39, 15 +; GCN-NEXT: v_readlane_b32 s53, v39, 14 +; GCN-NEXT: v_readlane_b32 s52, v39, 13 +; GCN-NEXT: v_readlane_b32 s51, v39, 12 +; GCN-NEXT: v_readlane_b32 s50, v39, 11 +; GCN-NEXT: v_readlane_b32 s49, v39, 10 +; GCN-NEXT: v_readlane_b32 s48, v39, 9 +; GCN-NEXT: 
v_readlane_b32 s47, v39, 8 +; GCN-NEXT: v_readlane_b32 s46, v39, 7 +; GCN-NEXT: v_readlane_b32 s45, v39, 6 +; GCN-NEXT: v_readlane_b32 s44, v39, 5 +; GCN-NEXT: v_readlane_b32 s43, v39, 4 +; GCN-NEXT: v_readlane_b32 s42, v39, 3 +; GCN-NEXT: v_readlane_b32 s41, v39, 2 +; GCN-NEXT: v_readlane_b32 s40, v39, 1 +; GCN-NEXT: v_readlane_b32 s39, v39, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s34, v0 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_add_i32 s5, s33, 0x42100 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s5 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, ptr addrspace(5) %local_val, align 128 From 6a3007683bf2fa05989c12c787f5547788d09178 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 17 Feb 2025 08:26:33 -0800 Subject: [PATCH 032/127] [Analysis] Remove getGuaranteedNonPoisonOps (#127461) commit 0517772b4ac20c5d3a0de0d4703354a179833248 Author: Philip Reames Date: Thu Dec 19 14:14:11 2024 -0800 --- llvm/include/llvm/Analysis/ValueTracking.h | 5 ----- llvm/lib/Analysis/ValueTracking.cpp | 8 -------- 2 files changed, 13 deletions(-) diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index 1b49f8a3e85b1..67f9f24c3b7a4 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -999,11 +999,6 @@ bool isGuaranteedToExecuteForEveryIteration(const Instruction *I, /// getGuaranteedNonPoisonOp. bool propagatesPoison(const Use &PoisonOp); -/// Insert operands of I into Ops such that I will trigger undefined behavior -/// if I is executed and that operand has a poison value. -void getGuaranteedNonPoisonOps(const Instruction *I, - SmallVectorImpl &Ops); - /// Return true if the given instruction must trigger undefined behavior /// when I is executed with any operands which appear in KnownPoison holding /// a poison value at the point of execution. diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index e4454c42c7857..91a5f194db9dc 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -8193,14 +8193,6 @@ static bool handleGuaranteedNonPoisonOps(const Instruction *I, } } -void llvm::getGuaranteedNonPoisonOps(const Instruction *I, - SmallVectorImpl &Operands) { - handleGuaranteedNonPoisonOps(I, [&](const Value *V) { - Operands.push_back(V); - return false; - }); -} - bool llvm::mustTriggerUB(const Instruction *I, const SmallPtrSetImpl &KnownPoison) { return handleGuaranteedNonPoisonOps( From 6d86a8a1a12856955aba5e06a6552ddafaaa208f Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 17 Feb 2025 16:58:09 +0000 Subject: [PATCH 033/127] LAA: scope responsibility of isNoWrapAddRec (NFC) (#127479) Free isNoWrapAddRec from the AddRec check, and rename it to isNoWrapGEP. 
--- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 34 ++++++++++++------------ 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 7d6dbd51a404d..23bfd9989469a 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -793,7 +793,8 @@ class AccessAnalysis { } // end anonymous namespace -/// Try to compute the stride for \p AR. Used by getPtrStride. +/// Try to compute a constant stride for \p AR. Used by getPtrStride and +/// isNoWrap. static std::optional getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, Value *Ptr, PredicatedScalarEvolution &PSE) { @@ -835,16 +836,24 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, return Stride; } -static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR, - PredicatedScalarEvolution &PSE, const Loop *L); +static bool isNoWrapGEP(Value *Ptr, PredicatedScalarEvolution &PSE, + const Loop *L); -/// Check whether a pointer address cannot wrap. +/// Check whether \p AR is a non-wrapping AddRec, or if \p Ptr is a non-wrapping +/// GEP. static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, Value *Ptr, Type *AccessTy, const Loop *L, bool Assume, std::optional Stride = std::nullopt) { + // FIXME: This should probably only return true for NUW. + if (AR->getNoWrapFlags(SCEV::NoWrapMask)) + return true; + + if (PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW)) + return true; + // The address calculation must not wrap. Otherwise, a dependence could be // inverted. - if (isNoWrapAddRec(Ptr, AR, PSE, L)) + if (isNoWrapGEP(Ptr, PSE, L)) return true; // An nusw getelementptr that is an AddRec cannot wrap. If it would wrap, @@ -1445,18 +1454,9 @@ void AccessAnalysis::processMemAccesses() { } } -/// Return true if an AddRec pointer \p Ptr is unsigned non-wrapping, -/// i.e. monotonically increasing/decreasing. -static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR, - PredicatedScalarEvolution &PSE, const Loop *L) { - - // FIXME: This should probably only return true for NUW. - if (AR->getNoWrapFlags(SCEV::NoWrapMask)) - return true; - - if (PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW)) - return true; - +/// Check whether \p Ptr is non-wrapping GEP. +static bool isNoWrapGEP(Value *Ptr, PredicatedScalarEvolution &PSE, + const Loop *L) { // Scalar evolution does not propagate the non-wrapping flags to values that // are derived from a non-wrapping induction variable because non-wrapping // could be flow-sensitive. From 15c2d1b328433d2c26327e072059c8960469d378 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Mon, 17 Feb 2025 17:36:02 +0000 Subject: [PATCH 034/127] [libclc] Fix dependencies on generated convert builtins (#127515) In #127378 it was reported that builds without clspv targets enabled were failing after #124727, as all targets had a dependency on a file that only clspv targets generated. A quick fix was merged in #127315 which wasn't correct. It moved the dependency on those generated files to the spirv targets, instead of onto the clspv targets. This means a build with spirv targets and without clspv targets would see the same problems as #127378 reported. I tried simply removing the requirement to explicitly add dependencies to the custom command, relying instead on the file-level dependencies. 
This didn't seem reliable enough; in some cases on a Makefiles build, the clang command compiling (e.g.,) convert.cl would begin before the file was fully written. Instead, we keep the target-level dependency but automatically infer it based on the generated file name, to avoid manual book-keeping of pairs of files and targets. This commit also fixes what looks like an unintended bug where, when ENABLE_RUNTIME_SUBNORMAL was enabled, the OpenCL conversions weren't being compiled. --- libclc/CMakeLists.txt | 22 ++++++++++++---------- libclc/cmake/modules/AddLibclc.cmake | 14 +++++++------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index c88ea9700d100..5cefa8a264310 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -243,30 +243,30 @@ add_custom_command( OUTPUT convert.cl COMMAND ${Python3_EXECUTABLE} ${script_loc} > convert.cl DEPENDS ${script_loc} ) -add_custom_target( "generate_convert.cl" DEPENDS convert.cl ) -set_target_properties( "generate_convert.cl" PROPERTIES FOLDER "libclc/Sourcegenning" ) +add_custom_target( generate-convert.cl DEPENDS convert.cl ) +set_target_properties( generate-convert.cl PROPERTIES FOLDER "libclc/Sourcegenning" ) add_custom_command( OUTPUT clc-convert.cl COMMAND ${Python3_EXECUTABLE} ${script_loc} --clc > clc-convert.cl DEPENDS ${script_loc} ) -add_custom_target( "clc-generate_convert.cl" DEPENDS clc-convert.cl ) -set_target_properties( "clc-generate_convert.cl" PROPERTIES FOLDER "libclc/Sourcegenning" ) +add_custom_target( generate-clc-convert.cl DEPENDS clc-convert.cl ) +set_target_properties( generate-clc-convert.cl PROPERTIES FOLDER "libclc/Sourcegenning" ) if ( clspv-- IN_LIST LIBCLC_TARGETS_TO_BUILD OR clspv64-- IN_LIST LIBCLC_TARGETS_TO_BUILD ) add_custom_command( OUTPUT clspv-convert.cl COMMAND ${Python3_EXECUTABLE} ${script_loc} --clspv > clspv-convert.cl DEPENDS ${script_loc} ) - add_custom_target( "clspv-generate_convert.cl" DEPENDS clspv-convert.cl ) - set_target_properties( "clspv-generate_convert.cl" PROPERTIES FOLDER "libclc/Sourcegenning" ) + add_custom_target( generate-clspv-convert.cl DEPENDS clspv-convert.cl ) + set_target_properties( generate-clspv-convert.cl PROPERTIES FOLDER "libclc/Sourcegenning" ) add_custom_command( OUTPUT clc-clspv-convert.cl COMMAND ${Python3_EXECUTABLE} ${script_loc} --clc --clspv > clc-clspv-convert.cl DEPENDS ${script_loc} ) - add_custom_target( "clc-clspv-generate_convert.cl" DEPENDS clc-clspv-convert.cl ) - set_target_properties( "clc-clspv-generate_convert.cl" PROPERTIES FOLDER "libclc/Sourcegenning" ) + add_custom_target( generate-clc-clspv-convert.cl DEPENDS clc-clspv-convert.cl ) + set_target_properties( generate-clc-clspv-convert.cl PROPERTIES FOLDER "libclc/Sourcegenning" ) endif() enable_testing() @@ -324,9 +324,11 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) if( NOT ARCH STREQUAL spirv AND NOT ARCH STREQUAL spirv64 ) if( ARCH STREQUAL clspv OR ARCH STREQUAL clspv64 ) list( APPEND opencl_gen_files clspv-convert.cl ) - elseif ( NOT ENABLE_RUNTIME_SUBNORMAL ) + else() list( APPEND opencl_gen_files convert.cl ) - list( APPEND opencl_lib_files generic/lib/subnormal_use_default.ll ) + if ( NOT ENABLE_RUNTIME_SUBNORMAL ) + list( APPEND opencl_lib_files generic/lib/subnormal_use_default.ll ) + endif() endif() endif() diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake index a3b311f12a1e3..5347b0822477b 100644 --- a/libclc/cmake/modules/AddLibclc.cmake +++ b/libclc/cmake/modules/AddLibclc.cmake @@ 
-230,11 +230,17 @@ function(add_libclc_builtin_set) # We need to take each file and produce an absolute input file, as well # as a unique architecture-specific output file. We deal with a mix of # different input files, which makes this trickier. + set( input_file_dep ) if( ${file} IN_LIST ARG_GEN_FILES ) # Generated files are given just as file names, which we must make # absolute to the binary directory. set( input_file ${CMAKE_CURRENT_BINARY_DIR}/${file} ) set( output_file "${LIBCLC_ARCH_OBJFILE_DIR}/${file}.bc" ) + # If a target exists that generates this file, add that as a dependency + # of the custom command. + if( TARGET generate-${file} ) + set( input_file_dep generate-${file} ) + endif() else() # Other files are originally relative to each SOURCE file, which are # then make relative to the libclc root directory. We must normalize @@ -249,19 +255,13 @@ function(add_libclc_builtin_set) get_filename_component( file_dir ${file} DIRECTORY ) - if( ARG_ARCH STREQUAL spirv OR ARG_ARCH STREQUAL spirv64 ) - set(CONVERT_DEP clspv-generate_convert.cl) - else() - set(CONVERT_DEP generate_convert.cl) - endif() - compile_to_bc( TRIPLE ${ARG_TRIPLE} INPUT ${input_file} OUTPUT ${output_file} EXTRA_OPTS -fno-builtin -nostdlib "${ARG_COMPILE_FLAGS}" -I${CMAKE_CURRENT_SOURCE_DIR}/${file_dir} - DEPENDENCIES ${CONVERT_DEP} + DEPENDENCIES ${input_file_dep} ) list( APPEND bytecode_files ${output_file} ) endforeach() From 2dda529838e622e7a79b1e26d2899f319fd7e379 Mon Sep 17 00:00:00 2001 From: Marius Kamp Date: Mon, 17 Feb 2025 18:44:08 +0100 Subject: [PATCH 035/127] [AArch64] Fix Fold of Compare with Right-shifted Value (#127209) This change folds (setcc ne (lshr x c) 0) for 64-bit types and constants c >= 32. This fold already existed for other types or smaller constants but was not applicable to 64-bit types and constants >= 32 due to a comparison of the constant c with the bit size of the setcc operation. The type of this operation is legalized to i32, which does not necessarily match the type of the lshr operation. Use the bit size of the type of the lshr operation instead for the comparison. Fixes #122380. 
--- .../Target/AArch64/AArch64ISelLowering.cpp | 4 +- llvm/test/CodeGen/AArch64/shift-const-ne-0.ll | 122 ++++++++++++++++++ 2 files changed, 124 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/shift-const-ne-0.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 4263be1098899..8f849af6f4d35 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -25070,10 +25070,10 @@ static SDValue performSETCCCombine(SDNode *N, // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne if (Cond == ISD::SETNE && isNullConstant(RHS) && LHS->getOpcode() == ISD::SRL && isa(LHS->getOperand(1)) && - LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() && LHS->hasOneUse()) { EVT TstVT = LHS->getValueType(0); - if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) { + if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64 && + LHS->getConstantOperandVal(1) < TstVT.getFixedSizeInBits()) { // this pattern will get better opt in emitComparison uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1); SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0), diff --git a/llvm/test/CodeGen/AArch64/shift-const-ne-0.ll b/llvm/test/CodeGen/AArch64/shift-const-ne-0.ll new file mode 100644 index 0000000000000..be064d591613c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/shift-const-ne-0.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-unknown-unknown < %s -o -| FileCheck %s + +define i1 @lsr_1_ne_0_16(i16 %x) { +; CHECK-LABEL: lsr_1_ne_0_16: +; CHECK: // %bb.0: +; CHECK-NEXT: tst w0, #0xfffe +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %shr = lshr i16 %x, 1 + %cmp = icmp ne i16 %shr, 0 + ret i1 %cmp +} + +define i1 @lsr_1_ne_0_32(i32 %x) { +; CHECK-LABEL: lsr_1_ne_0_32: +; CHECK: // %bb.0: +; CHECK-NEXT: tst w0, #0xfffffffe +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %shr = lshr i32 %x, 1 + %cmp = icmp ne i32 %shr, 0 + ret i1 %cmp +} + +define i1 @lsr_30_ne_0_32(i32 %x) { +; CHECK-LABEL: lsr_30_ne_0_32: +; CHECK: // %bb.0: +; CHECK-NEXT: tst w0, #0xc0000000 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %shr = lshr i32 %x, 30 + %cmp = icmp ne i32 %shr, 0 + ret i1 %cmp +} + +define i1 @lsr_31_ne_0_32(i32 %x) { +; CHECK-LABEL: lsr_31_ne_0_32: +; CHECK: // %bb.0: +; CHECK-NEXT: lsr w0, w0, #31 +; CHECK-NEXT: ret + %shr = lshr i32 %x, 31 + %cmp = icmp ne i32 %shr, 0 + ret i1 %cmp +} + +define i1 @lsr_1_ne_0_64(i64 %x) { +; CHECK-LABEL: lsr_1_ne_0_64: +; CHECK: // %bb.0: +; CHECK-NEXT: tst x0, #0xfffffffffffffffe +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %shr = lshr i64 %x, 1 + %cmp = icmp ne i64 %shr, 0 + ret i1 %cmp +} + +define i1 @lsr_31_ne_0_64(i64 %x) { +; CHECK-LABEL: lsr_31_ne_0_64: +; CHECK: // %bb.0: +; CHECK-NEXT: tst x0, #0xffffffff80000000 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %shr = lshr i64 %x, 31 + %cmp = icmp ne i64 %shr, 0 + ret i1 %cmp +} + +define i1 @lsr_32_ne_0_64(i64 %x) { +; CHECK-LABEL: lsr_32_ne_0_64: +; CHECK: // %bb.0: +; CHECK-NEXT: tst x0, #0xffffffff00000000 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %shr = lshr i64 %x, 32 + %cmp = icmp ne i64 %shr, 0 + ret i1 %cmp +} + +define i1 @lsr_33_ne_0_64(i64 %x) { +; CHECK-LABEL: lsr_33_ne_0_64: +; CHECK: // %bb.0: +; CHECK-NEXT: tst x0, #0xfffffffe00000000 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %shr = lshr i64 %x, 33 + 
%cmp = icmp ne i64 %shr, 0 + ret i1 %cmp +} + +define i1 @lsr_62_ne_0_64(i64 %x) { +; CHECK-LABEL: lsr_62_ne_0_64: +; CHECK: // %bb.0: +; CHECK-NEXT: tst x0, #0xc000000000000000 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %shr = lshr i64 %x, 62 + %cmp = icmp ne i64 %shr, 0 + ret i1 %cmp +} + +define i1 @lsr_63_ne_0_64(i64 %x) { +; CHECK-LABEL: lsr_63_ne_0_64: +; CHECK: // %bb.0: +; CHECK-NEXT: lsr x0, x0, #63 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %shr = lshr i64 %x, 63 + %cmp = icmp ne i64 %shr, 0 + ret i1 %cmp +} + +define <4 x i1> @lsr_1_ne_0_v4i16(<4 x i16> %x) { +; CHECK-LABEL: lsr_1_ne_0_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ushr v0.4h, v0.4h, #1 +; CHECK-NEXT: cmtst v0.4h, v0.4h, v0.4h +; CHECK-NEXT: ret + %shr = lshr <4 x i16> %x, + %cmp = icmp ne <4 x i16> %shr, + ret <4 x i1> %cmp +} From 85f7ec12b86494f98f1ea28e51b38d52f2aecc8b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 17 Feb 2025 09:50:01 -0800 Subject: [PATCH 036/127] [RISCV] Remove unneeded unmasked patterns for vcpop_v and riscv_vfirst_vl. (#127435) The pseudos had RISCVMaskedPseudo add in #115162 so I we are able to convert the masked form to unmasked form automatically. --- llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td | 8 -------- 1 file changed, 8 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index c914dc2ffbcd3..ffa3d3982647d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -2701,20 +2701,12 @@ foreach mti = AllMasks in { VR:$rs, VR:$rs, GPR:$vl, mti.Log2SEW)>; // 15.2 Vector count population in mask vcpop.m - def : Pat<(XLenVT (riscv_vcpop_vl (mti.Mask VR:$rs2), (mti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVCPOP_M_" # mti.BX) - VR:$rs2, GPR:$vl, mti.Log2SEW)>; def : Pat<(XLenVT (riscv_vcpop_vl (mti.Mask VR:$rs2), (mti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVCPOP_M_" # mti.BX # "_MASK") VR:$rs2, (mti.Mask VMV0:$vm), GPR:$vl, mti.Log2SEW)>; // 15.3 vfirst find-first-set mask bit - def : Pat<(XLenVT (riscv_vfirst_vl (mti.Mask VR:$rs2), (mti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVFIRST_M_" # mti.BX) - VR:$rs2, GPR:$vl, mti.Log2SEW)>; def : Pat<(XLenVT (riscv_vfirst_vl (mti.Mask VR:$rs2), (mti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVFIRST_M_" # mti.BX # "_MASK") From 62254f6615e453ee576a39557e4fc9ddb84965c2 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 17 Feb 2025 09:51:01 -0800 Subject: [PATCH 037/127] [Targets] Move *TargetStreamer.h files into their MCTargetDesc directory. (#127433) These files are included from MCTargetDesc so should be there instead of in the main directory for the target. 
--- llvm/lib/Target/ARC/{ => MCTargetDesc}/ARCTargetStreamer.h | 0 llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp | 2 +- llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp | 2 +- .../Target/Hexagon/{ => MCTargetDesc}/HexagonTargetStreamer.h | 0 llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 2 +- llvm/lib/Target/Mips/{ => MCTargetDesc}/MipsTargetStreamer.h | 0 llvm/lib/Target/Mips/MipsAsmPrinter.cpp | 2 +- llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp | 2 +- llvm/lib/Target/PowerPC/{ => MCTargetDesc}/PPCTargetStreamer.h | 0 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 2 +- llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 2 +- .../Target/SystemZ/{ => MCTargetDesc}/SystemZTargetStreamer.h | 0 llvm/lib/Target/SystemZ/SystemZAsmPrinter.h | 2 +- llvm/lib/Target/XCore/{ => MCTargetDesc}/XCoreTargetStreamer.h | 0 llvm/lib/Target/XCore/XCoreAsmPrinter.cpp | 2 +- 15 files changed, 9 insertions(+), 9 deletions(-) rename llvm/lib/Target/ARC/{ => MCTargetDesc}/ARCTargetStreamer.h (100%) rename llvm/lib/Target/Hexagon/{ => MCTargetDesc}/HexagonTargetStreamer.h (100%) rename llvm/lib/Target/Mips/{ => MCTargetDesc}/MipsTargetStreamer.h (100%) rename llvm/lib/Target/PowerPC/{ => MCTargetDesc}/PPCTargetStreamer.h (100%) rename llvm/lib/Target/SystemZ/{ => MCTargetDesc}/SystemZTargetStreamer.h (100%) rename llvm/lib/Target/XCore/{ => MCTargetDesc}/XCoreTargetStreamer.h (100%) diff --git a/llvm/lib/Target/ARC/ARCTargetStreamer.h b/llvm/lib/Target/ARC/MCTargetDesc/ARCTargetStreamer.h similarity index 100% rename from llvm/lib/Target/ARC/ARCTargetStreamer.h rename to llvm/lib/Target/ARC/MCTargetDesc/ARCTargetStreamer.h diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index 20881de1d94f4..f8c57fc5e0058 100644 --- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -6,13 +6,13 @@ // //===----------------------------------------------------------------------===// -#include "HexagonTargetStreamer.h" #include "MCTargetDesc/HexagonMCChecker.h" #include "MCTargetDesc/HexagonMCELFStreamer.h" #include "MCTargetDesc/HexagonMCExpr.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" #include "MCTargetDesc/HexagonShuffler.h" +#include "MCTargetDesc/HexagonTargetStreamer.h" #include "TargetInfo/HexagonTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp index f10122fdacfcd..c6f250353f736 100644 --- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -16,11 +16,11 @@ #include "HexagonInstrInfo.h" #include "HexagonRegisterInfo.h" #include "HexagonSubtarget.h" -#include "HexagonTargetStreamer.h" #include "MCTargetDesc/HexagonInstPrinter.h" #include "MCTargetDesc/HexagonMCExpr.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" +#include "MCTargetDesc/HexagonTargetStreamer.h" #include "TargetInfo/HexagonTargetInfo.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" diff --git a/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonTargetStreamer.h similarity index 100% rename from llvm/lib/Target/Hexagon/HexagonTargetStreamer.h rename to llvm/lib/Target/Hexagon/MCTargetDesc/HexagonTargetStreamer.h diff --git 
a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index d108564e128c0..8c328d5ed7234 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -11,7 +11,7 @@ #include "MCTargetDesc/MipsBaseInfo.h" #include "MCTargetDesc/MipsMCExpr.h" #include "MCTargetDesc/MipsMCTargetDesc.h" -#include "MipsTargetStreamer.h" +#include "MCTargetDesc/MipsTargetStreamer.h" #include "TargetInfo/MipsTargetInfo.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/SmallVector.h" diff --git a/llvm/lib/Target/Mips/MipsTargetStreamer.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.h similarity index 100% rename from llvm/lib/Target/Mips/MipsTargetStreamer.h rename to llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.h diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index b0b7b5dc7a31d..e06a9b36bfe4f 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -17,12 +17,12 @@ #include "MCTargetDesc/MipsInstPrinter.h" #include "MCTargetDesc/MipsMCNaCl.h" #include "MCTargetDesc/MipsMCTargetDesc.h" +#include "MCTargetDesc/MipsTargetStreamer.h" #include "Mips.h" #include "MipsMCInstLower.h" #include "MipsMachineFunction.h" #include "MipsSubtarget.h" #include "MipsTargetMachine.h" -#include "MipsTargetStreamer.h" #include "TargetInfo/MipsTargetInfo.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index dc75814b9796b..016e4f9f7c6b6 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -8,8 +8,8 @@ #include "MCTargetDesc/PPCMCExpr.h" #include "MCTargetDesc/PPCMCTargetDesc.h" +#include "MCTargetDesc/PPCTargetStreamer.h" #include "PPCInstrInfo.h" -#include "PPCTargetStreamer.h" #include "TargetInfo/PowerPCTargetInfo.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" diff --git a/llvm/lib/Target/PowerPC/PPCTargetStreamer.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCTargetStreamer.h similarity index 100% rename from llvm/lib/Target/PowerPC/PPCTargetStreamer.h rename to llvm/lib/Target/PowerPC/MCTargetDesc/PPCTargetStreamer.h diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 162d11058266f..5784fe43879fe 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -19,12 +19,12 @@ #include "MCTargetDesc/PPCMCExpr.h" #include "MCTargetDesc/PPCMCTargetDesc.h" #include "MCTargetDesc/PPCPredicates.h" +#include "MCTargetDesc/PPCTargetStreamer.h" #include "PPC.h" #include "PPCInstrInfo.h" #include "PPCMachineFunctionInfo.h" #include "PPCSubtarget.h" #include "PPCTargetMachine.h" -#include "PPCTargetStreamer.h" #include "TargetInfo/PowerPCTargetInfo.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index b892c9ea69602..4b26437c5fecb 100644 --- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -9,7 +9,7 @@ #include "MCTargetDesc/SystemZGNUInstPrinter.h" #include "MCTargetDesc/SystemZMCAsmInfo.h" #include "MCTargetDesc/SystemZMCTargetDesc.h" -#include "SystemZTargetStreamer.h" +#include 
"MCTargetDesc/SystemZTargetStreamer.h" #include "TargetInfo/SystemZTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" diff --git a/llvm/lib/Target/SystemZ/SystemZTargetStreamer.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZTargetStreamer.h similarity index 100% rename from llvm/lib/Target/SystemZ/SystemZTargetStreamer.h rename to llvm/lib/Target/SystemZ/MCTargetDesc/SystemZTargetStreamer.h diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h index 2696702b44551..47e7f67e2cdc7 100644 --- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h +++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h @@ -9,9 +9,9 @@ #ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H +#include "MCTargetDesc/SystemZTargetStreamer.h" #include "SystemZMCInstLower.h" #include "SystemZTargetMachine.h" -#include "SystemZTargetStreamer.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/MC/MCInstBuilder.h" diff --git a/llvm/lib/Target/XCore/XCoreTargetStreamer.h b/llvm/lib/Target/XCore/MCTargetDesc/XCoreTargetStreamer.h similarity index 100% rename from llvm/lib/Target/XCore/XCoreTargetStreamer.h rename to llvm/lib/Target/XCore/MCTargetDesc/XCoreTargetStreamer.h diff --git a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp index 15be47a73cef3..a1f7608224b90 100644 --- a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp +++ b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp @@ -12,12 +12,12 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/XCoreInstPrinter.h" +#include "MCTargetDesc/XCoreTargetStreamer.h" #include "TargetInfo/XCoreTargetInfo.h" #include "XCore.h" #include "XCoreMCInstLower.h" #include "XCoreSubtarget.h" #include "XCoreTargetMachine.h" -#include "XCoreTargetStreamer.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/AsmPrinter.h" From 74656476b860be93ccaac12b62d81679166207fd Mon Sep 17 00:00:00 2001 From: Adam Siemieniuk Date: Mon, 17 Feb 2025 19:01:25 +0100 Subject: [PATCH 038/127] [mlir][x86vector] Fix integration tests lowering (#124934) Fixes MLIR lowering passes in x86vector integration tests. The tests are refactored with lowering pass bundle which ensures that all dialect are lowered into LLVM dialect. This simplifies the test pipelines and addresses missing arith lowering. 
--- mlir/test/Integration/Dialect/Vector/CPU/X86Vector/dot.mlir | 2 +- .../Dialect/Vector/CPU/X86Vector/inline-asm-vector-avx512.mlir | 3 +-- .../Dialect/Vector/CPU/X86Vector/mask-compress.mlir | 2 +- mlir/test/Integration/Dialect/Vector/CPU/X86Vector/rsqrt.mlir | 2 +- .../Dialect/Vector/CPU/X86Vector/sparse-dot-product.mlir | 2 +- .../Dialect/Vector/CPU/X86Vector/vp2intersect-i32.mlir | 2 +- 6 files changed, 6 insertions(+), 7 deletions(-) diff --git a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/dot.mlir b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/dot.mlir index 4b901289d1a4b..53a7282e1f141 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/dot.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/dot.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm="enable-x86vector" -convert-func-to-llvm -reconcile-unrealized-casts | \ +// RUN: mlir-opt %s -convert-vector-to-llvm="enable-x86vector" -test-lower-to-llvm | \ // RUN: mlir-translate --mlir-to-llvmir | \ // RUN: %lli --entry-function=entry --mattr="avx" --dlopen=%mlir_c_runner_utils | \ // RUN: FileCheck %s diff --git a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/inline-asm-vector-avx512.mlir b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/inline-asm-vector-avx512.mlir index 828e498543a9f..8376464cee42d 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/inline-asm-vector-avx512.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/inline-asm-vector-avx512.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-linalg-to-loops -convert-vector-to-scf='full-unroll=true' -lower-affine -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm='use-bare-ptr-memref-call-conv=1' -convert-arith-to-llvm -reconcile-unrealized-casts |\ +// RUN: mlir-opt %s -convert-linalg-to-loops -convert-vector-to-scf='full-unroll=true' -test-lower-to-llvm |\ // RUN: mlir-translate --mlir-to-llvmir |\ // RUN: %lli --entry-function=entry --mattr="avx512f" --dlopen=%mlir_c_runner_utils |\ // RUN: FileCheck %s @@ -37,4 +37,3 @@ module { llvm.return %i0 : i32 } } - diff --git a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/mask-compress.mlir b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/mask-compress.mlir index f1d7caeb4f3da..eda9138d222a0 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/mask-compress.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/mask-compress.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm="enable-x86vector" -convert-func-to-llvm -reconcile-unrealized-casts | \ +// RUN: mlir-opt %s -convert-vector-to-llvm="enable-x86vector" -test-lower-to-llvm | \ // RUN: mlir-translate --mlir-to-llvmir | \ // RUN: %lli --entry-function=entry --mattr="avx512bw" --dlopen=%mlir_c_runner_utils | \ // RUN: FileCheck %s diff --git a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/rsqrt.mlir b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/rsqrt.mlir index 225f9963aeeea..6cc4e6ca69fe3 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/rsqrt.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/rsqrt.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm="enable-x86vector" -convert-func-to-llvm -reconcile-unrealized-casts | \ +// RUN: mlir-opt %s -convert-vector-to-llvm="enable-x86vector" -test-lower-to-llvm | \ // RUN: mlir-translate 
--mlir-to-llvmir | \ // RUN: %lli --entry-function=entry --mattr="avx" --dlopen=%mlir_c_runner_utils | \ // RUN: FileCheck %s diff --git a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/sparse-dot-product.mlir b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/sparse-dot-product.mlir index f665891536ada..bf1caaafa3ff4 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/sparse-dot-product.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/sparse-dot-product.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm="enable-x86vector" -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ +// RUN: mlir-opt %s -convert-vector-to-llvm="enable-x86vector" -test-lower-to-llvm | \ // RUN: mlir-translate --mlir-to-llvmir | \ // RUN: %lli --entry-function=entry --mattr="avx512bw,avx512vp2intersect" --dlopen=%mlir_c_runner_utils | \ // RUN: FileCheck %s diff --git a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/vp2intersect-i32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/vp2intersect-i32.mlir index 2eccf00f221a7..46124c2ba87c4 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/vp2intersect-i32.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/vp2intersect-i32.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm="enable-x86vector" -convert-func-to-llvm -reconcile-unrealized-casts | \ +// RUN: mlir-opt %s -convert-vector-to-llvm="enable-x86vector" -test-lower-to-llvm | \ // RUN: mlir-translate --mlir-to-llvmir | \ // RUN: %lli --entry-function=entry --mattr="avx512bw,avx512vp2intersect" --dlopen=%mlir_c_runner_utils | \ // RUN: FileCheck %s From 941f7cbf5a3e7aa9f36b002dc22cfdb4ff50fea8 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Mon, 17 Feb 2025 19:08:07 +0100 Subject: [PATCH 039/127] [libc++][TZDB] Fixes mapping of nonexisting time. (#127330) All non-existing local times in a contiguous range should map to the same time point. This fixes a bug where the times inside the range were mapped to the wrong time. Fixes: #113654 --- libcxx/include/__chrono/time_zone.h | 8 ++++++-- .../time.zone.members/to_sys_choose.pass.cpp | 17 +++++++++++++++-- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/libcxx/include/__chrono/time_zone.h b/libcxx/include/__chrono/time_zone.h index ab5c22eceaaf1..d18d59d2736bf 100644 --- a/libcxx/include/__chrono/time_zone.h +++ b/libcxx/include/__chrono/time_zone.h @@ -103,10 +103,14 @@ class _LIBCPP_AVAILABILITY_TZDB time_zone { to_sys(const local_time<_Duration>& __time, choose __z) const { local_info __info = get_info(__time); switch (__info.result) { - case local_info::unique: - case local_info::nonexistent: // first and second are the same + case local_info::unique: // first and second are the same return sys_time>{__time.time_since_epoch() - __info.first.offset}; + case local_info::nonexistent: + // first and second are the same + // All non-existing values are converted to the same time.
+ return sys_time>{__info.first.end}; + case local_info::ambiguous: switch (__z) { case choose::earliest: diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys_choose.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys_choose.pass.cpp index bad4ef352e9b9..1147c9fadf9ae 100644 --- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys_choose.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys_choose.pass.cpp @@ -88,7 +88,7 @@ static void test_nonexistent() { // Pick an historic date where it's well known what the time zone rules were. // This makes it unlikely updates to the database change these rules. std::chrono::local_time time{ - (std::chrono::sys_days{std::chrono::March / 30 / 1986} + 2h + 30min).time_since_epoch()}; + (std::chrono::sys_days{std::chrono::March / 30 / 1986} + 2h).time_since_epoch()}; std::chrono::sys_seconds expected{time.time_since_epoch() - 1h}; @@ -100,6 +100,13 @@ static void test_nonexistent() { assert(tz->to_sys(time + 0us, std::chrono::choose::latest) == expected); assert(tz->to_sys(time + 0ms, std::chrono::choose::earliest) == expected); assert(tz->to_sys(time + 0s, std::chrono::choose::latest) == expected); + + // The entire nonexisting hour should map to the same time. + // For nonexistant the value of std::chrono::choose has no effect. + assert(tz->to_sys(time + 1s, std::chrono::choose::earliest) == expected); + assert(tz->to_sys(time + 1min, std::chrono::choose::latest) == expected); + assert(tz->to_sys(time + 30min, std::chrono::choose::earliest) == expected); + assert(tz->to_sys(time + 59min + 59s, std::chrono::choose::latest) == expected); } // Tests ambiguous conversions. @@ -120,7 +127,7 @@ static void test_ambiguous() { // Pick an historic date where it's well known what the time zone rules were. // This makes it unlikely updates to the database change these rules. std::chrono::local_time time{ - (std::chrono::sys_days{std::chrono::September / 28 / 1986} + 2h + 30min).time_since_epoch()}; + (std::chrono::sys_days{std::chrono::September / 28 / 1986} + 2h).time_since_epoch()}; std::chrono::sys_seconds earlier{time.time_since_epoch() - 2h}; std::chrono::sys_seconds later{time.time_since_epoch() - 1h}; @@ -133,6 +140,12 @@ static void test_ambiguous() { assert(tz->to_sys(time + 0us, std::chrono::choose::latest) == later); assert(tz->to_sys(time + 0ms, std::chrono::choose::earliest) == earlier); assert(tz->to_sys(time + 0s, std::chrono::choose::latest) == later); + + // Test times in the ambigious hour + assert(tz->to_sys(time + 1s, std::chrono::choose::earliest) == earlier + 1s); + assert(tz->to_sys(time + 1min, std::chrono::choose::latest) == later + 1min); + assert(tz->to_sys(time + 30min, std::chrono::choose::earliest) == earlier + 30min); + assert(tz->to_sys(time + 59min + 59s, std::chrono::choose::latest) == later + 59min + 59s); } // This test does the basic validations of this function. The library function From eaa460ca499bec0547393bae7c18b128c2926839 Mon Sep 17 00:00:00 2001 From: Scott Linder Date: Mon, 17 Feb 2025 13:27:23 -0500 Subject: [PATCH 040/127] [AMDGPU] Remove dead function metadata after amdgpu-lower-kernel-arguments (#126147) The verifier ensures function !dbg metadata is unique across the module, so ensure the old nameless function we leave behind doesn't violate this invariant. Removing the function via e.g. eraseFromParent seems like a better option, but doesn't seem to be legal from a FunctionPass. 
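Illustratively (hypothetical IR, not taken from the test below): without this change the pass leaves behind a nameless declaration that still carries the kernel's !dbg attachment, e.g. `declare void @0(ptr addrspace(1)) !dbg !4`, which triggers the verifier error the old test expected; clearing the metadata leaves a plain declaration behind instead.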
--- .../lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 1 + .../AMDGPU/preload-implicit-kernargs-debug-info.ll | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index e9d009baa20af..09412d1b0f1cc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -132,6 +132,7 @@ class PreloadKernelArgInfo { NF->setAttributes(AL); F.replaceAllUsesWith(NF); F.setCallingConv(CallingConv::C); + F.clearMetadata(); return NF; } diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll index a6a0b88dce125..b008f397318e8 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll @@ -1,7 +1,13 @@ -; RUN: not --crash opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s 2>&1 | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s 2>&1 \ +; RUN: | FileCheck --match-full-lines --implicit-check-not='declare' %s -; CHECK: function declaration may only have a unique !dbg attachment -; CHECK-NEXT: ptr @0 +; Confirms we do not leave behind a declaration which references the same +; DISubprogram metadata. + +; CHECK: define amdgpu_kernel void @preload_block_count_x{{.*}} !dbg ![[#]] !max_work_group_size ![[#]] { +; CHECK: declare void @0{{.*}} #[[#]] +; CHECK: declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #[[#]] +; CHECK: declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #[[#]] define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) !dbg !4 !max_work_group_size !7 { %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() From 29ca3b8b28cb42ee796f40fe40f5f9ddc1ea2f42 Mon Sep 17 00:00:00 2001 From: Scott Linder Date: Mon, 17 Feb 2025 13:29:56 -0500 Subject: [PATCH 041/127] [AMDGPU] Push amdgpu-preload-kern-arg-prolog after livedebugvalues (#126148) This is effectively a workaround for a bug in livedebugvalues, but seems to potentially be a general improvement, as BB sections seems like it could ruin the special 256-byte prelude scheme that amdgpu-preload-kern-arg-prolog requires anyway. Moving it even later doesn't seem to have any material impact, and just adds livedebugvalues to the list of things which no longer have to deal with pseudo multiple-entry functions. AMDGPU debug-info isn't supported upstream yet, so the bug being avoided isn't testable here. I am posting the patch upstream to avoid an unnecessary diff with AMD's fork. 
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 6 ++++++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 10 +++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index eb488843b53e0..92ab106dd4a98 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1151,6 +1151,7 @@ class GCNPassConfig final : public AMDGPUPassConfig { void addPostRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; + void addPostBBSections() override; }; } // end anonymous namespace @@ -1690,6 +1691,11 @@ void GCNPassConfig::addPreEmitPass() { addPass(&AMDGPUInsertDelayAluID); addPass(&BranchRelaxationPassID); +} + +void GCNPassConfig::addPostBBSections() { + // We run this later to avoid passes like livedebugvalues and BBSections + // having to deal with the apparent multi-entry functions we may generate. addPass(createAMDGPUPreloadKernArgPrologLegacyPass()); } diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 893b9fa6fb40d..d7f54f3b8e9e2 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -145,11 +145,11 @@ ; GCN-O0-NEXT: Post RA hazard recognizer ; GCN-O0-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O0-NEXT: Branch relaxation pass -; GCN-O0-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O0-NEXT: Register Usage Information Collector Pass ; GCN-O0-NEXT: Remove Loads Into Fake Uses ; GCN-O0-NEXT: Live DEBUG_VALUE analysis ; GCN-O0-NEXT: Machine Sanitizer Binary Metadata +; GCN-O0-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O0-NEXT: Machine Optimization Remark Emitter ; GCN-O0-NEXT: Stack Frame Layout Analysis @@ -430,11 +430,11 @@ ; GCN-O1-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O1-NEXT: AMDGPU Insert Delay ALU ; GCN-O1-NEXT: Branch relaxation pass -; GCN-O1-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O1-NEXT: Register Usage Information Collector Pass ; GCN-O1-NEXT: Remove Loads Into Fake Uses ; GCN-O1-NEXT: Live DEBUG_VALUE analysis ; GCN-O1-NEXT: Machine Sanitizer Binary Metadata +; GCN-O1-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-NEXT: Machine Optimization Remark Emitter ; GCN-O1-NEXT: Stack Frame Layout Analysis @@ -743,11 +743,11 @@ ; GCN-O1-OPTS-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O1-OPTS-NEXT: AMDGPU Insert Delay ALU ; GCN-O1-OPTS-NEXT: Branch relaxation pass -; GCN-O1-OPTS-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O1-OPTS-NEXT: Register Usage Information Collector Pass ; GCN-O1-OPTS-NEXT: Remove Loads Into Fake Uses ; GCN-O1-OPTS-NEXT: Live DEBUG_VALUE analysis ; GCN-O1-OPTS-NEXT: Machine Sanitizer Binary Metadata +; GCN-O1-OPTS-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter ; GCN-O1-OPTS-NEXT: Stack Frame Layout Analysis @@ -1062,11 +1062,11 @@ ; GCN-O2-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O2-NEXT: AMDGPU Insert Delay ALU ; GCN-O2-NEXT: Branch relaxation pass -; GCN-O2-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O2-NEXT: Register Usage Information Collector Pass ; GCN-O2-NEXT: Remove Loads Into Fake Uses ; GCN-O2-NEXT: Live DEBUG_VALUE analysis ; GCN-O2-NEXT: Machine Sanitizer Binary 
Metadata +; GCN-O2-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O2-NEXT: Machine Optimization Remark Emitter ; GCN-O2-NEXT: Stack Frame Layout Analysis @@ -1394,11 +1394,11 @@ ; GCN-O3-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O3-NEXT: AMDGPU Insert Delay ALU ; GCN-O3-NEXT: Branch relaxation pass -; GCN-O3-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O3-NEXT: Register Usage Information Collector Pass ; GCN-O3-NEXT: Remove Loads Into Fake Uses ; GCN-O3-NEXT: Live DEBUG_VALUE analysis ; GCN-O3-NEXT: Machine Sanitizer Binary Metadata +; GCN-O3-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O3-NEXT: Machine Optimization Remark Emitter ; GCN-O3-NEXT: Stack Frame Layout Analysis From 0d2722c20d75b237524dd4ec87a1d3da707ec96e Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Mon, 17 Feb 2025 12:41:11 -0600 Subject: [PATCH 042/127] [libc][Docs] Add proper 'offload' build to use libc with offload Summary: Since this was added the offloading target now requires `offload`. Fixes: https://github.com/llvm/llvm-project/issues/127458 --- libc/docs/gpu/building.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/docs/gpu/building.rst b/libc/docs/gpu/building.rst index 94d3f1f644e5c..9f9528b30d9bf 100644 --- a/libc/docs/gpu/building.rst +++ b/libc/docs/gpu/building.rst @@ -43,7 +43,7 @@ arguments automatically. $> cd build $> cmake ../llvm -G Ninja \ -DLLVM_ENABLE_PROJECTS="clang;lld" \ - -DLLVM_ENABLE_RUNTIMES="openmp" \ + -DLLVM_ENABLE_RUNTIMES="openmp;offload" \ -DCMAKE_BUILD_TYPE= \ # Select build type -DCMAKE_INSTALL_PREFIX= \ # Where the libraries will live -DRUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES=libc \ From 15944056aa5c1ab2c777dd2e3b4f19b8a1f1403d Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Mon, 17 Feb 2025 20:03:32 +0100 Subject: [PATCH 043/127] [LLD][COFF] Split native and EC .CRT chunks on ARM64X (#127203) --- lld/COFF/Writer.cpp | 11 +++++++++ lld/COFF/Writer.h | 3 +++ lld/test/COFF/arm64x-crt-sec.s | 42 ++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) create mode 100644 lld/test/COFF/arm64x-crt-sec.s diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 678de915b6cdb..504558087c80d 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -403,6 +403,12 @@ void OutputSection::addContributingPartialSection(PartialSection *sec) { contribSections.push_back(sec); } +void OutputSection::splitECChunks() { + llvm::stable_sort(chunks, [=](const Chunk *a, const Chunk *b) { + return (a->getMachine() != ARM64) < (b->getMachine() != ARM64); + }); +} + // Check whether the target address S is in range from a relocation // of type relType at address P. bool Writer::isInRange(uint16_t relType, uint64_t s, uint64_t p, int margin, @@ -1156,6 +1162,11 @@ void Writer::createSections() { sec->addContributingPartialSection(pSec); } + if (ctx.hybridSymtab) { + if (OutputSection *sec = findSection(".CRT")) + sec->splitECChunks(); + } + // Finally, move some output sections to the end. 
auto sectionOrder = [&](const OutputSection *s) { // Move DISCARDABLE (or non-memory-mapped) sections to the end of file diff --git a/lld/COFF/Writer.h b/lld/COFF/Writer.h index 9004bb310d073..7e458b766bae8 100644 --- a/lld/COFF/Writer.h +++ b/lld/COFF/Writer.h @@ -50,6 +50,9 @@ class OutputSection { void writeHeaderTo(uint8_t *buf, bool isDebug); void addContributingPartialSection(PartialSection *sec); + // Sort chunks to split native and EC sections on hybrid targets. + void splitECChunks(); + // Returns the size of this section in an executable memory image. // This may be smaller than the raw size (the raw size is multiple // of disk sector size, so there may be padding at end), or may be diff --git a/lld/test/COFF/arm64x-crt-sec.s b/lld/test/COFF/arm64x-crt-sec.s new file mode 100644 index 0000000000000..5be70a1845f12 --- /dev/null +++ b/lld/test/COFF/arm64x-crt-sec.s @@ -0,0 +1,42 @@ +// REQUIRES: aarch64, x86 +// RUN: split-file %s %t.dir && cd %t.dir + +// RUN: llvm-mc -filetype=obj -triple=aarch64-windows crt1-arm64.s -o crt1-arm64.obj +// RUN: llvm-mc -filetype=obj -triple=aarch64-windows crt2-arm64.s -o crt2-arm64.obj +// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows crt1-arm64ec.s -o crt1-arm64ec.obj +// RUN: llvm-mc -filetype=obj -triple=x86_64-windows crt2-amd64.s -o crt2-amd64.obj + +// Check that .CRT chunks are correctly sorted and that EC and native chunks are split. + +// RUN: lld-link -out:out.dll -machine:arm64x -dll -noentry crt1-arm64.obj crt2-arm64.obj crt1-arm64ec.obj crt2-amd64.obj +// RUN: llvm-readobj --hex-dump=.CRT out.dll | FileCheck %s + +// RUN: lld-link -out:out2.dll -machine:arm64x -dll -noentry crt1-arm64.obj crt1-arm64ec.obj crt2-arm64.obj crt2-amd64.obj +// RUN: llvm-readobj --hex-dump=.CRT out2.dll | FileCheck %s + +// RUN: lld-link -out:out3.dll -machine:arm64x -dll -noentry crt2-amd64.obj crt1-arm64ec.obj crt2-arm64.obj crt1-arm64.obj +// RUN: llvm-readobj --hex-dump=.CRT out3.dll | FileCheck %s + +// CHECK: 0x180002000 01000000 00000000 02000000 00000000 +// CHECK-NEXT: 0x180002010 03000000 00000000 11000000 00000000 +// CHECK-NEXT: 0x180002020 12000000 00000000 13000000 00000000 + +#--- crt1-arm64.s + .section .CRT$A,"dr" + .xword 1 + .section .CRT$Z,"dr" + .xword 3 + +#--- crt2-arm64.s + .section .CRT$B,"dr" + .xword 2 + +#--- crt1-arm64ec.s + .section .CRT$A,"dr" + .xword 0x11 + .section .CRT$Z,"dr" + .xword 0x13 + +#--- crt2-amd64.s + .section .CRT$B,"dr" + .quad 0x12 From f0e39c45df2075ac338bc06b595079da8466b695 Mon Sep 17 00:00:00 2001 From: Ahmed Bougacha Date: Mon, 17 Feb 2025 11:18:45 -0800 Subject: [PATCH 044/127] [AArch64] Add aliases for processors apple-a18/s6..10. (#127152) apple-a18 is an alias of apple-m4. apple-s6/s7/s8 are aliases of apple-a13. apple-s9/s10 are aliases of apple-a16. As with some other aliases today, this reflects identical ISA feature support, but not necessarily identical microarchitectures and performance characteristics. 
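As a usage example (hypothetical invocation): `clang --target=arm64-apple-darwin -mcpu=apple-a18 -c foo.c` is now accepted and selects the same feature set as `-mcpu=apple-m4`.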
--- clang/test/Driver/print-supported-cpus-aarch64.c | 6 ++++++ clang/test/Misc/target-invalid-cpu-note/aarch64.c | 6 ++++++ llvm/lib/Target/AArch64/AArch64Processors.td | 6 ++++++ llvm/unittests/TargetParser/TargetParserTest.cpp | 14 ++++++++++++-- 4 files changed, 30 insertions(+), 2 deletions(-) diff --git a/clang/test/Driver/print-supported-cpus-aarch64.c b/clang/test/Driver/print-supported-cpus-aarch64.c index 3c1dcebf7c6c8..3a0ccaf015428 100644 --- a/clang/test/Driver/print-supported-cpus-aarch64.c +++ b/clang/test/Driver/print-supported-cpus-aarch64.c @@ -14,6 +14,7 @@ // CHECK: apple-a15 // CHECK: apple-a16 // CHECK: apple-a17 +// CHECK: apple-a18 // CHECK: apple-a7 // CHECK: apple-a8 // CHECK: apple-a9 @@ -21,7 +22,12 @@ // CHECK: apple-m2 // CHECK: apple-m3 // CHECK: apple-m4 +// CHECK: apple-s10 // CHECK: apple-s4 // CHECK: apple-s5 +// CHECK: apple-s6 +// CHECK: apple-s7 +// CHECK: apple-s8 +// CHECK: apple-s9 // CHECK: Use -mcpu or -mtune to specify the target's processor. diff --git a/clang/test/Misc/target-invalid-cpu-note/aarch64.c b/clang/test/Misc/target-invalid-cpu-note/aarch64.c index e6ff09557fe07..98a2ca0447bcf 100644 --- a/clang/test/Misc/target-invalid-cpu-note/aarch64.c +++ b/clang/test/Misc/target-invalid-cpu-note/aarch64.c @@ -19,6 +19,7 @@ // CHECK-SAME: {{^}}, apple-a15 // CHECK-SAME: {{^}}, apple-a16 // CHECK-SAME: {{^}}, apple-a17 +// CHECK-SAME: {{^}}, apple-a18 // CHECK-SAME: {{^}}, apple-a7 // CHECK-SAME: {{^}}, apple-a8 // CHECK-SAME: {{^}}, apple-a9 @@ -26,8 +27,13 @@ // CHECK-SAME: {{^}}, apple-m2 // CHECK-SAME: {{^}}, apple-m3 // CHECK-SAME: {{^}}, apple-m4 +// CHECK-SAME: {{^}}, apple-s10 // CHECK-SAME: {{^}}, apple-s4 // CHECK-SAME: {{^}}, apple-s5 +// CHECK-SAME: {{^}}, apple-s6 +// CHECK-SAME: {{^}}, apple-s7 +// CHECK-SAME: {{^}}, apple-s8 +// CHECK-SAME: {{^}}, apple-s9 // CHECK-SAME: {{^}}, carmel // CHECK-SAME: {{^}}, cobalt-100 // CHECK-SAME: {{^}}, cortex-a34 diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index d1d4986d12550..b977b6aaaf619 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -1224,6 +1224,9 @@ def : ProcessorAlias<"apple-s5", "apple-a12">; def : ProcessorModel<"apple-a13", CycloneModel, ProcessorFeatures.AppleA13, [TuneAppleA13]>; +def : ProcessorAlias<"apple-s6", "apple-a13">; +def : ProcessorAlias<"apple-s7", "apple-a13">; +def : ProcessorAlias<"apple-s8", "apple-a13">; def : ProcessorModel<"apple-a14", CycloneModel, ProcessorFeatures.AppleA14, [TuneAppleA14]>; @@ -1236,12 +1239,15 @@ def : ProcessorAlias<"apple-m2", "apple-a15">; def : ProcessorModel<"apple-a16", CycloneModel, ProcessorFeatures.AppleA16, [TuneAppleA16]>; def : ProcessorAlias<"apple-m3", "apple-a16">; +def : ProcessorAlias<"apple-s9", "apple-a16">; +def : ProcessorAlias<"apple-s10", "apple-a16">; def : ProcessorModel<"apple-a17", CycloneModel, ProcessorFeatures.AppleA17, [TuneAppleA17]>; def : ProcessorModel<"apple-m4", CycloneModel, ProcessorFeatures.AppleM4, [TuneAppleM4]>; +def : ProcessorAlias<"apple-a18", "apple-m4">; // Alias for the latest Apple processor model supported by LLVM. 
def : ProcessorAlias<"apple-latest", "apple-m4">; diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 7fee62721e6e0..93ac7381b02ef 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1130,14 +1130,20 @@ INSTANTIATE_TEST_SUITE_P( AArch64CPUTestParams("apple-s4", "armv8.3-a"), AArch64CPUTestParams("apple-s5", "armv8.3-a"), AArch64CPUTestParams("apple-a13", "armv8.4-a"), + AArch64CPUTestParams("apple-s6", "armv8.4-a"), + AArch64CPUTestParams("apple-s7", "armv8.4-a"), + AArch64CPUTestParams("apple-s8", "armv8.4-a"), AArch64CPUTestParams("apple-a14", "armv8.4-a"), AArch64CPUTestParams("apple-m1", "armv8.4-a"), AArch64CPUTestParams("apple-a15", "armv8.6-a"), AArch64CPUTestParams("apple-m2", "armv8.6-a"), AArch64CPUTestParams("apple-a16", "armv8.6-a"), AArch64CPUTestParams("apple-m3", "armv8.6-a"), + AArch64CPUTestParams("apple-s9", "armv8.6-a"), + AArch64CPUTestParams("apple-s10", "armv8.6-a"), AArch64CPUTestParams("apple-a17", "armv8.6-a"), AArch64CPUTestParams("apple-m4", "armv8.7-a"), + AArch64CPUTestParams("apple-a18", "armv8.7-a"), AArch64CPUTestParams("exynos-m3", "armv8-a"), AArch64CPUTestParams("exynos-m4", "armv8.2-a"), AArch64CPUTestParams("exynos-m5", "armv8.2-a"), @@ -1246,13 +1252,17 @@ INSTANTIATE_TEST_SUITE_P( "apple-a8", "apple-a9"}), AArch64CPUAliasTestParams({"apple-a12", "apple-s4", "apple-s5"}), + AArch64CPUAliasTestParams({"apple-a13", "apple-s6", + "apple-s7", "apple-s8"}), AArch64CPUAliasTestParams({"apple-a14", "apple-m1"}), AArch64CPUAliasTestParams({"apple-a15", "apple-m2"}), - AArch64CPUAliasTestParams({"apple-a16", "apple-m3"})), + AArch64CPUAliasTestParams({"apple-a16", "apple-m3", + "apple-s9", "apple-s10"}), + AArch64CPUAliasTestParams({"apple-m4", "apple-a18"})), AArch64CPUAliasTestParams::PrintToStringParamName); // Note: number of CPUs includes aliases. -static constexpr unsigned NumAArch64CPUArchs = 82; +static constexpr unsigned NumAArch64CPUArchs = 88; TEST(TargetParserTest, testAArch64CPUArchList) { SmallVector List; From 6fde8fe9adc835df50ea57b710781ffe8a6657e8 Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Mon, 17 Feb 2025 11:19:14 -0800 Subject: [PATCH 045/127] [lldb] Provide default impl for MightHaveChildren (NFC) (#119977) The vast majority of `SyntheticChildrenFrontEnd` subclasses provide children, and as such implement `MightHaveChildren` with a constant value of `true`. This change makes `true` the default value. With this change, `MightHaveChildren` only needs to be implemented by synthetic providers that can return `false`, which is only 3 subclasses. 
--- .../lldb/DataFormatters/TypeSynthetic.h | 2 +- .../lldb/DataFormatters/VectorIterator.h | 2 - lldb/source/DataFormatters/VectorType.cpp | 2 - .../Language/CPlusPlus/BlockPointer.cpp | 3 -- .../Plugins/Language/CPlusPlus/Coroutines.cpp | 5 --- .../Plugins/Language/CPlusPlus/Coroutines.h | 2 - .../Language/CPlusPlus/GenericBitset.cpp | 1 - .../Language/CPlusPlus/GenericOptional.cpp | 1 - .../Plugins/Language/CPlusPlus/LibCxx.cpp | 10 ----- .../Plugins/Language/CPlusPlus/LibCxx.h | 4 -- .../Language/CPlusPlus/LibCxxAtomic.cpp | 7 --- .../CPlusPlus/LibCxxInitializerList.cpp | 7 --- .../Plugins/Language/CPlusPlus/LibCxxList.cpp | 1 - .../Plugins/Language/CPlusPlus/LibCxxMap.cpp | 14 ------ .../Language/CPlusPlus/LibCxxProxyArray.cpp | 7 --- .../Language/CPlusPlus/LibCxxQueue.cpp | 1 - .../CPlusPlus/LibCxxRangesRefView.cpp | 2 - .../Language/CPlusPlus/LibCxxSliceArray.cpp | 7 --- .../Plugins/Language/CPlusPlus/LibCxxSpan.cpp | 7 --- .../Language/CPlusPlus/LibCxxTuple.cpp | 1 - .../Language/CPlusPlus/LibCxxUnorderedMap.cpp | 14 ------ .../Language/CPlusPlus/LibCxxValarray.cpp | 7 --- .../Language/CPlusPlus/LibCxxVariant.cpp | 1 - .../Language/CPlusPlus/LibCxxVector.cpp | 9 ---- .../Plugins/Language/CPlusPlus/LibStdcpp.cpp | 10 ----- .../Language/CPlusPlus/LibStdcppTuple.cpp | 4 -- .../CPlusPlus/LibStdcppUniquePointer.cpp | 4 -- lldb/source/Plugins/Language/ObjC/NSArray.cpp | 22 --------- .../Plugins/Language/ObjC/NSDictionary.cpp | 45 ------------------- lldb/source/Plugins/Language/ObjC/NSError.cpp | 2 - .../Plugins/Language/ObjC/NSException.cpp | 2 - lldb/source/Plugins/Language/ObjC/NSSet.cpp | 40 +---------------- 32 files changed, 2 insertions(+), 244 deletions(-) diff --git a/lldb/include/lldb/DataFormatters/TypeSynthetic.h b/lldb/include/lldb/DataFormatters/TypeSynthetic.h index bf6dc6a0c3c6b..14e516964f250 100644 --- a/lldb/include/lldb/DataFormatters/TypeSynthetic.h +++ b/lldb/include/lldb/DataFormatters/TypeSynthetic.h @@ -68,7 +68,7 @@ class SyntheticChildrenFrontEnd { // a false return value from this call if it returns true, then // CalculateNumChildren() can return any number >= 0 (0 being valid) it // should if at all possible be more efficient than CalculateNumChildren() - virtual bool MightHaveChildren() = 0; + virtual bool MightHaveChildren() { return true; } // if this function returns a non-null ValueObject, then the returned // ValueObject will stand for this ValueObject whenever a "value" request is diff --git a/lldb/include/lldb/DataFormatters/VectorIterator.h b/lldb/include/lldb/DataFormatters/VectorIterator.h index 70bcf50ca1b1d..d095f085cabab 100644 --- a/lldb/include/lldb/DataFormatters/VectorIterator.h +++ b/lldb/include/lldb/DataFormatters/VectorIterator.h @@ -30,8 +30,6 @@ class VectorIteratorSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: diff --git a/lldb/source/DataFormatters/VectorType.cpp b/lldb/source/DataFormatters/VectorType.cpp index cba107b7da890..fa3fb1b674efb 100644 --- a/lldb/source/DataFormatters/VectorType.cpp +++ b/lldb/source/DataFormatters/VectorType.cpp @@ -268,8 +268,6 @@ class VectorTypeSyntheticFrontEnd : public SyntheticChildrenFrontEnd { return lldb::ChildCacheState::eRefetch; } - bool MightHaveChildren() override { return true; } - size_t GetIndexOfChildWithName(ConstString name) override { const char *item_name = name.GetCString(); uint32_t idx = ExtractIndexFromString(item_name); diff 
--git a/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp b/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp index d7d4654a6b5f4..6a22501c98aab 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp @@ -144,9 +144,6 @@ class BlockPointerSyntheticFrontEnd : public SyntheticChildrenFrontEnd { return lldb::ChildCacheState::eRefetch; } - // maybe return false if the block pointer is, say, null - bool MightHaveChildren() override { return true; } - size_t GetIndexOfChildWithName(ConstString name) override { if (!m_block_struct_type.IsValid()) return UINT32_MAX; diff --git a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp index 5e63d1d7b2145..76a10d2393782 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp @@ -199,11 +199,6 @@ lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t StdlibCoroutineHandleSyntheticFrontEnd::GetIndexOfChildWithName( ConstString name) { if (!m_resume_ptr_sp || !m_destroy_ptr_sp) diff --git a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h index f9765f3255d2b..c33c82bd2fc45 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h +++ b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h @@ -40,8 +40,6 @@ class StdlibCoroutineHandleSyntheticFrontEnd lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: diff --git a/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp b/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp index 33955dccb6ccc..f83f81fbdd1e7 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp @@ -32,7 +32,6 @@ class GenericBitsetFrontEnd : public SyntheticChildrenFrontEnd { return formatters::ExtractIndexFromString(name.GetCString()); } - bool MightHaveChildren() override { return true; } lldb::ChildCacheState Update() override; llvm::Expected CalculateNumChildren() override { return m_elements.size(); diff --git a/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp b/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp index a8a7c16de5e86..b224d3e859c84 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp @@ -42,7 +42,6 @@ class GenericOptionalFrontend : public SyntheticChildrenFrontEnd { return formatters::ExtractIndexFromString(name.GetCString()); } - bool MightHaveChildren() override { return true; } llvm::Expected CalculateNumChildren() override { return m_has_value ? 
1U : 0U; } diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp index 2aa8fdba70634..98e787dacc505 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp @@ -309,11 +309,6 @@ lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (name == "__ptr_") @@ -412,11 +407,6 @@ lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (name == "pointer") diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h index cb9ceaf093300..21fbb361eb934 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h @@ -102,8 +102,6 @@ class LibcxxSharedPtrSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; ~LibcxxSharedPtrSyntheticFrontEnd() override; @@ -122,8 +120,6 @@ class LibcxxUniquePtrSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; ~LibcxxUniquePtrSyntheticFrontEnd() override; diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp index 7f30dc186291e..3104f33ee80b3 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp @@ -96,8 +96,6 @@ class LibcxxStdAtomicSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -119,11 +117,6 @@ lldb_private::formatters::LibcxxStdAtomicSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxStdAtomicSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - llvm::Expected lldb_private::formatters:: LibcxxStdAtomicSyntheticFrontEnd::CalculateNumChildren() { return m_real_child ? 
1 : 0; diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp index 67c6d1d3e5506..cd13455a2e460 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp @@ -32,8 +32,6 @@ class LibcxxInitializerListSyntheticFrontEnd lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -99,11 +97,6 @@ lldb_private::formatters::LibcxxInitializerListSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxInitializerListSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxInitializerListSyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (!m_start) diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp index f33b148249ab9..ae1ad2bfe7200 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp @@ -109,7 +109,6 @@ class AbstractListFrontEnd : public SyntheticChildrenFrontEnd { size_t GetIndexOfChildWithName(ConstString name) override { return ExtractIndexFromString(name.GetCString()); } - bool MightHaveChildren() override { return true; } lldb::ChildCacheState Update() override; protected: diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp index ebaf60a16b069..d75f25f49fdb4 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp @@ -197,8 +197,6 @@ class LibcxxStdMapSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -237,8 +235,6 @@ class LibCxxMapIteratorSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; ~LibCxxMapIteratorSyntheticFrontEnd() override = default; @@ -397,11 +393,6 @@ lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { return ExtractIndexFromString(name.GetCString()); @@ -497,11 +488,6 @@ lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::GetChildAtIndex( return m_pair_sp->GetChildAtIndex(idx); } -bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (!m_pair_sp) diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxProxyArray.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxProxyArray.cpp index c659adbb9ab2e..fdb8f07ec4006 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxProxyArray.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxProxyArray.cpp @@ -41,8 +41,6 @@ class LibcxxStdProxyArraySyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState 
Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -173,11 +171,6 @@ lldb_private::formatters::LibcxxStdProxyArraySyntheticFrontEnd::Update() { return ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxStdProxyArraySyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxStdProxyArraySyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (!m_base) diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp index 5b459a17fe29b..8f1e35b3bede9 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp @@ -25,7 +25,6 @@ class QueueFrontEnd : public SyntheticChildrenFrontEnd { : UINT32_MAX; } - bool MightHaveChildren() override { return true; } lldb::ChildCacheState Update() override; llvm::Expected CalculateNumChildren() override { diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp index f3fe56525789a..e8ab37a022fbc 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp @@ -40,8 +40,6 @@ class LibcxxStdRangesRefViewSyntheticFrontEnd lldb::ChildCacheState Update() override; - bool MightHaveChildren() override { return true; } - size_t GetIndexOfChildWithName(ConstString name) override { // We only have a single child return 0; diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxSliceArray.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSliceArray.cpp index 5d607709d2c6f..523a7ab1001ec 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxSliceArray.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSliceArray.cpp @@ -62,8 +62,6 @@ class LibcxxStdSliceArraySyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -145,11 +143,6 @@ lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEnd::Update() { return ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (!m_start) diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp index 15040295efe6d..ab3a5cf954ec7 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp @@ -55,8 +55,6 @@ class LibcxxStdSpanSyntheticFrontEnd : public SyntheticChildrenFrontEnd { // from the only other place it can be: the template argument. 
lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -126,11 +124,6 @@ lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eReuse; } -bool lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (!m_start) diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp index 3e3259ab428df..263ca8349b891 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp @@ -24,7 +24,6 @@ class TupleFrontEnd: public SyntheticChildrenFrontEnd { return formatters::ExtractIndexFromString(name.GetCString()); } - bool MightHaveChildren() override { return true; } lldb::ChildCacheState Update() override; llvm::Expected CalculateNumChildren() override { return m_elements.size(); diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp index be520ee27af06..395ecc489a17e 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp @@ -40,8 +40,6 @@ class LibcxxStdUnorderedMapSyntheticFrontEnd lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -70,8 +68,6 @@ class LibCxxUnorderedMapIteratorSyntheticFrontEnd lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -295,11 +291,6 @@ lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { return ExtractIndexFromString(name.GetCString()); @@ -407,11 +398,6 @@ lldb::ValueObjectSP lldb_private::formatters:: return lldb::ValueObjectSP(); } -bool lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (name == "first") diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp index 3f519f8c585f5..18c9c9b0e8710 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp @@ -30,8 +30,6 @@ class LibcxxStdValarraySyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -124,11 +122,6 @@ lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::Update() { return ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if 
(!m_start || !m_finish) diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp index 62794318e0777..c3cb1fdcb4251 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp @@ -203,7 +203,6 @@ class VariantFrontEnd : public SyntheticChildrenFrontEnd { return formatters::ExtractIndexFromString(name.GetCString()); } - bool MightHaveChildren() override { return true; } lldb::ChildCacheState Update() override; llvm::Expected CalculateNumChildren() override { return m_size; } ValueObjectSP GetChildAtIndex(uint32_t idx) override; diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp index b762379a07d3a..ae3ed6326b45f 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp @@ -33,8 +33,6 @@ class LibcxxStdVectorSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -54,8 +52,6 @@ class LibcxxVectorBoolSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override { return true; } - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -153,11 +149,6 @@ lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (!m_start || !m_finish) diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp index 0a1877471916d..127c0cd6666a8 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp @@ -49,8 +49,6 @@ class LibstdcppMapIteratorSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -70,8 +68,6 @@ class LibStdcppSharedPtrSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -149,8 +145,6 @@ LibstdcppMapIteratorSyntheticFrontEnd::GetChildAtIndex(uint32_t idx) { return lldb::ValueObjectSP(); } -bool LibstdcppMapIteratorSyntheticFrontEnd::MightHaveChildren() { return true; } - size_t LibstdcppMapIteratorSyntheticFrontEnd::GetIndexOfChildWithName( ConstString name) { if (name == "first") @@ -232,8 +226,6 @@ VectorIteratorSyntheticFrontEnd::GetChildAtIndex(uint32_t idx) { return lldb::ValueObjectSP(); } -bool VectorIteratorSyntheticFrontEnd::MightHaveChildren() { return true; } - size_t VectorIteratorSyntheticFrontEnd::GetIndexOfChildWithName( ConstString name) { if (name == "item") @@ -416,8 +408,6 @@ lldb::ChildCacheState LibStdcppSharedPtrSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool LibStdcppSharedPtrSyntheticFrontEnd::MightHaveChildren() { return true; } - size_t LibStdcppSharedPtrSyntheticFrontEnd::GetIndexOfChildWithName( ConstString name) { if 
(name == "pointer") diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp index f59969d4cd7a1..68133b202a0c8 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp @@ -32,8 +32,6 @@ class LibStdcppTupleSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -86,8 +84,6 @@ lldb::ChildCacheState LibStdcppTupleSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool LibStdcppTupleSyntheticFrontEnd::MightHaveChildren() { return true; } - lldb::ValueObjectSP LibStdcppTupleSyntheticFrontEnd::GetChildAtIndex(uint32_t idx) { if (idx < m_members.size() && m_members[idx]) diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp index 9447f7463f64a..209aaced23c7d 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp @@ -32,8 +32,6 @@ class LibStdcppUniquePtrSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; bool GetSummary(Stream &stream, const TypeSummaryOptions &options); @@ -113,8 +111,6 @@ lldb::ChildCacheState LibStdcppUniquePtrSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool LibStdcppUniquePtrSyntheticFrontEnd::MightHaveChildren() { return true; } - lldb::ValueObjectSP LibStdcppUniquePtrSyntheticFrontEnd::GetChildAtIndex(uint32_t idx) { if (idx == 0 && m_ptr_obj) diff --git a/lldb/source/Plugins/Language/ObjC/NSArray.cpp b/lldb/source/Plugins/Language/ObjC/NSArray.cpp index 072b8b5a6c860..7054dd8ffa952 100644 --- a/lldb/source/Plugins/Language/ObjC/NSArray.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSArray.cpp @@ -56,8 +56,6 @@ class NSArrayMSyntheticFrontEndBase : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override = 0; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; protected: @@ -220,8 +218,6 @@ class GenericNSArrayISyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -325,8 +321,6 @@ class NSArray1SyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; }; } // namespace formatters @@ -532,11 +526,6 @@ lldb_private::formatters::GenericNSArrayMSyntheticFrontEnd::Update() { : lldb::ChildCacheState::eRefetch; } -bool -lldb_private::formatters::NSArrayMSyntheticFrontEndBase::MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::NSArrayMSyntheticFrontEndBase::GetIndexOfChildWithName( ConstString name) { @@ -674,13 +663,6 @@ lldb_private::formatters::GenericNSArrayISyntheticFrontEnd -bool -lldb_private::formatters::GenericNSArrayISyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - template lldb::ValueObjectSP lldb_private::formatters::GenericNSArrayISyntheticFrontEnd:: @@ -764,10 +746,6 @@ 
lldb_private::formatters::NSArray1SyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::NSArray1SyntheticFrontEnd::MightHaveChildren() { - return true; -} - lldb::ValueObjectSP lldb_private::formatters::NSArray1SyntheticFrontEnd::GetChildAtIndex( uint32_t idx) { diff --git a/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp b/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp index cf8750fd4976e..008e8eb569f01 100644 --- a/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp @@ -109,8 +109,6 @@ class NSDictionaryISyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -150,8 +148,6 @@ class NSConstantDictionarySyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -182,8 +178,6 @@ class NSCFDictionarySyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -215,8 +209,6 @@ class NSDictionary1SyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -236,8 +228,6 @@ class GenericNSDictionaryMSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -269,8 +259,6 @@ namespace Foundation1100 { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -648,11 +636,6 @@ lldb_private::formatters::NSDictionaryISyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::NSDictionaryISyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - lldb::ValueObjectSP lldb_private::formatters::NSDictionaryISyntheticFrontEnd::GetChildAtIndex( uint32_t idx) { @@ -770,11 +753,6 @@ lldb_private::formatters::NSCFDictionarySyntheticFrontEnd::Update() { : lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::NSCFDictionarySyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - lldb::ValueObjectSP lldb_private::formatters::NSCFDictionarySyntheticFrontEnd::GetChildAtIndex( uint32_t idx) { @@ -914,11 +892,6 @@ lldb_private::formatters::NSConstantDictionarySyntheticFrontEnd::Update() { : lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::NSConstantDictionarySyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - lldb::ValueObjectSP lldb_private::formatters:: NSConstantDictionarySyntheticFrontEnd::GetChildAtIndex(uint32_t idx) { uint32_t num_children = CalculateNumChildrenIgnoringErrors(); @@ -1005,11 +978,6 @@ lldb_private::formatters::NSDictionary1SyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::NSDictionary1SyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - lldb::ValueObjectSP lldb_private::formatters::NSDictionary1SyntheticFrontEnd::GetChildAtIndex( uint32_t idx) { @@ -1131,13 +1099,6 @@ 
lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd -bool -lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - template lldb::ValueObjectSP lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd< @@ -1292,12 +1253,6 @@ lldb::ChildCacheState lldb_private::formatters::Foundation1100:: : lldb::ChildCacheState::eRefetch; } -bool -lldb_private::formatters::Foundation1100:: - NSDictionaryMSyntheticFrontEnd::MightHaveChildren() { - return true; -} - lldb::ValueObjectSP lldb_private::formatters::Foundation1100:: NSDictionaryMSyntheticFrontEnd::GetChildAtIndex(uint32_t idx) { diff --git a/lldb/source/Plugins/Language/ObjC/NSError.cpp b/lldb/source/Plugins/Language/ObjC/NSError.cpp index bb54044ae1d61..5557daa2bf1b2 100644 --- a/lldb/source/Plugins/Language/ObjC/NSError.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSError.cpp @@ -165,8 +165,6 @@ class NSErrorSyntheticFrontEnd : public SyntheticChildrenFrontEnd { return lldb::ChildCacheState::eRefetch; } - bool MightHaveChildren() override { return true; } - size_t GetIndexOfChildWithName(ConstString name) override { static ConstString g_userInfo("_userInfo"); if (name == g_userInfo) diff --git a/lldb/source/Plugins/Language/ObjC/NSException.cpp b/lldb/source/Plugins/Language/ObjC/NSException.cpp index b7d42bc5745e3..67f3f1779e147 100644 --- a/lldb/source/Plugins/Language/ObjC/NSException.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSException.cpp @@ -148,8 +148,6 @@ class NSExceptionSyntheticFrontEnd : public SyntheticChildrenFrontEnd { : lldb::ChildCacheState::eRefetch; } - bool MightHaveChildren() override { return true; } - size_t GetIndexOfChildWithName(ConstString name) override { // NSException has 4 members: // NSString *name; diff --git a/lldb/source/Plugins/Language/ObjC/NSSet.cpp b/lldb/source/Plugins/Language/ObjC/NSSet.cpp index a184ec624b63e..55069495676e5 100644 --- a/lldb/source/Plugins/Language/ObjC/NSSet.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSSet.cpp @@ -52,8 +52,6 @@ class NSSetISyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -90,8 +88,6 @@ class NSCFSetSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -123,8 +119,6 @@ class GenericNSSetMSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -225,24 +219,7 @@ namespace Foundation1437 { return __NSSetMSize_Impl(process, valobj_addr, error); } } -} - -class NSSetCodeRunningSyntheticFrontEnd : public SyntheticChildrenFrontEnd { -public: - NSSetCodeRunningSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp); - - ~NSSetCodeRunningSyntheticFrontEnd() override; - - llvm::Expected CalculateNumChildren() override; - - lldb::ValueObjectSP GetChildAtIndex(uint32_t idx) override; - - lldb::ChildCacheState Update() override; - - bool MightHaveChildren() override; - - size_t GetIndexOfChildWithName(ConstString name) override; -}; + } // namespace Foundation1437 } // namespace formatters } // namespace lldb_private @@ -461,10 +438,6 @@ lldb_private::formatters::NSSetISyntheticFrontEnd::Update() { return lldb::ChildCacheState::eReuse; } -bool 
lldb_private::formatters::NSSetISyntheticFrontEnd::MightHaveChildren() { - return true; -} - lldb::ValueObjectSP lldb_private::formatters::NSSetISyntheticFrontEnd::GetChildAtIndex( uint32_t idx) { @@ -582,10 +555,6 @@ lldb_private::formatters::NSCFSetSyntheticFrontEnd::Update() { : lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::NSCFSetSyntheticFrontEnd::MightHaveChildren() { - return true; -} - lldb::ValueObjectSP lldb_private::formatters::NSCFSetSyntheticFrontEnd::GetChildAtIndex( uint32_t idx) { @@ -739,13 +708,6 @@ lldb_private::formatters::GenericNSSetMSyntheticFrontEnd::Update() { : lldb::ChildCacheState::eRefetch; } -template -bool -lldb_private::formatters:: - GenericNSSetMSyntheticFrontEnd::MightHaveChildren() { - return true; -} - template lldb::ValueObjectSP lldb_private::formatters:: From 83e180cb70266545f03a3449e4de0c3725fdfa55 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Mon, 17 Feb 2025 13:46:37 -0600 Subject: [PATCH 046/127] [Clang][PGO] Fix profile function visibility bug (#127257) This pull request fixes an issue that was introduced in #93365. `__llvm_write_custom_profile` visibility was causing issues on Darwin. This function needs to be publicly accessible in order to be accessed by libomptarget, so this pull request makes `__llvm_write_custom_profile` an explicitly exported symbol on Darwin. Tested on M3 and X86 macs. --- clang/lib/Driver/ToolChains/Darwin.cpp | 14 +++++++++----- compiler-rt/lib/profile/InstrProfilingFile.c | 10 ++++------ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index b26c5bf1a909e..75f126965e0ac 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -1480,11 +1480,15 @@ void Darwin::addProfileRTLibs(const ArgList &Args, // If we have a symbol export directive and we're linking in the profile // runtime, automatically export symbols necessary to implement some of the // runtime's functionality. 
- if (hasExportSymbolDirective(Args) && ForGCOV) { - addExportedSymbol(CmdArgs, "___gcov_dump"); - addExportedSymbol(CmdArgs, "___gcov_reset"); - addExportedSymbol(CmdArgs, "_writeout_fn_list"); - addExportedSymbol(CmdArgs, "_reset_fn_list"); + if (hasExportSymbolDirective(Args)) { + if (ForGCOV) { + addExportedSymbol(CmdArgs, "___gcov_dump"); + addExportedSymbol(CmdArgs, "___gcov_reset"); + addExportedSymbol(CmdArgs, "_writeout_fn_list"); + addExportedSymbol(CmdArgs, "_reset_fn_list"); + } else { + addExportedSymbol(CmdArgs, "___llvm_write_custom_profile"); + } } // Align __llvm_prf_{cnts,bits,data} sections to the maximum expected page diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 503d159fd9817..e5eca7947cf9b 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -1362,12 +1362,10 @@ COMPILER_RT_VISIBILITY int __llvm_profile_set_file_object(FILE *File, return 0; } -int __llvm_write_custom_profile(const char *Target, - const __llvm_profile_data *DataBegin, - const __llvm_profile_data *DataEnd, - const char *CountersBegin, - const char *CountersEnd, const char *NamesBegin, - const char *NamesEnd) { +COMPILER_RT_USED int __llvm_write_custom_profile( + const char *Target, const __llvm_profile_data *DataBegin, + const __llvm_profile_data *DataEnd, const char *CountersBegin, + const char *CountersEnd, const char *NamesBegin, const char *NamesEnd) { int ReturnValue = 0, FilenameLength, TargetLength; char *FilenameBuf, *TargetFilename; const char *Filename; From f5c5bc5ed57e63fe11ddd58c4b392f27b86730f1 Mon Sep 17 00:00:00 2001 From: Akira Hatanaka Date: Mon, 17 Feb 2025 11:50:44 -0800 Subject: [PATCH 047/127] [CodeGen][ObjC] Invalidate cached ObjC class layout information after parsing ObjC class implementations if new ivars are added to the interface (#126591) The layout and the size of an ObjC interface can change after its corresponding implementation is parsed when synthesized ivars or ivars declared in categories are added to the interface's list of ivars. This can cause clang to mis-compile if the optimization that emits fixed offsets for ivars (see 923ddf65f4e21ec67018cf56e823895de18d83bc) uses an ObjC class layout that is outdated and no longer reflects the current state of the class. For example, when compiling `constant-non-fragile-ivar-offset.m`, clang emits 20 instead of 24 as the offset for `IntermediateClass2Property` as the class layout for `SuperClass2`, which is created when the implementation of IntermediateClass3 is parsed, is outdated when the implementation of `IntermediateClass2` is parsed. This commit invalidates the stale layout information of the class and its subclasses if new ivars are added to the interface. With this change, we can also stop using ObjC implementation decls as the key to retrieve ObjC class layouts information as the layout retrieved using the ObjC interface as the key will always be up to date. 
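To make the ordering issue concrete, here is a rough Objective-C sketch of the scenario (the class and property names are invented for illustration; the actual reproducer is the `constant-non-fragile-ivar-offset.m` test in this patch, and this sketch only mirrors its shape):

```objc
// Invented names; a simplified mirror of the test case added by this patch.
@interface NSObject {
  int fixed;                  // root class whose layout never changes
}
@end

@interface Base : NSObject
@property int baseProp;       // the backing ivar _baseProp is only added to
@end                          // Base's ivar list once @implementation Base is parsed

@interface Sibling : Base
@property int siblingProp;
@end

@interface Derived : Base
@property int derivedProp;    // its constant offset depends on Base's final size
@end

@implementation Sibling       // handling this computes and caches Base's layout
@end                          // before Base's synthesized ivar exists

@implementation Base          // _baseProp is synthesized here, so Base grows
@end

@implementation Derived       // without invalidating the cached layout, the
@end                          // emitted constant offset for _derivedProp is stale
```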
rdar://139531391 --- clang/include/clang/AST/ASTContext.h | 28 +++++------ clang/lib/AST/ASTContext.cpp | 27 +++++------ clang/lib/AST/RecordLayoutBuilder.cpp | 34 ++------------ clang/lib/CodeGen/CGObjCGNU.cpp | 13 +++-- clang/lib/CodeGen/CGObjCMac.cpp | 7 +-- clang/lib/CodeGen/CGObjCRuntime.cpp | 10 ++-- clang/lib/Sema/SemaDeclObjC.cpp | 7 +++ .../constant-non-fragile-ivar-offset.m | 47 +++++++++++++++++++ clang/test/CodeGenObjC/ivar-layout-64.m | 4 +- 9 files changed, 103 insertions(+), 74 deletions(-) diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index a96b9c0a17045..d275873651786 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -287,8 +287,8 @@ class ASTContext : public RefCountedBase { /// This is lazily created. This is intentionally not serialized. mutable llvm::DenseMap ASTRecordLayouts; - mutable llvm::DenseMap - ObjCLayouts; + mutable llvm::DenseMap + ObjCLayouts; /// A cache from types to size and alignment information. using TypeInfoMap = llvm::DenseMap; @@ -500,6 +500,11 @@ class ASTContext : public RefCountedBase { static constexpr unsigned GeneralTypesLog2InitSize = 9; static constexpr unsigned FunctionProtoTypesLog2InitSize = 12; + /// A mapping from an ObjC class to its subclasses. + llvm::DenseMap> + ObjCSubClasses; + ASTContext &this_() { return *this; } public: @@ -2671,13 +2676,6 @@ class ASTContext : public RefCountedBase { void DumpRecordLayout(const RecordDecl *RD, raw_ostream &OS, bool Simple = false) const; - /// Get or compute information about the layout of the specified - /// Objective-C implementation. - /// - /// This may differ from the interface if synthesized ivars are present. - const ASTRecordLayout & - getASTObjCImplementationLayout(const ObjCImplementationDecl *D) const; - /// Get our current best idea for the key function of the /// given record decl, or nullptr if there isn't one. /// @@ -2716,7 +2714,6 @@ class ASTContext : public RefCountedBase { /// Get the offset of an ObjCIvarDecl in bits. uint64_t lookupFieldBitOffset(const ObjCInterfaceDecl *OID, - const ObjCImplementationDecl *ID, const ObjCIvarDecl *Ivar) const; /// Find the 'this' offset for the member path in a pointer-to-member @@ -3174,7 +3171,12 @@ class ASTContext : public RefCountedBase { bool &CanUseFirst, bool &CanUseSecond, SmallVectorImpl &NewParamInfos); - void ResetObjCLayout(const ObjCContainerDecl *CD); + void ResetObjCLayout(const ObjCInterfaceDecl *D); + + void addObjCSubClass(const ObjCInterfaceDecl *D, + const ObjCInterfaceDecl *SubClass) { + ObjCSubClasses[D].push_back(SubClass); + } //===--------------------------------------------------------------------===// // Integer Predicates @@ -3564,9 +3566,7 @@ OPT_LIST(V) friend class DeclarationNameTable; friend class DeclContext; - const ASTRecordLayout & - getObjCLayout(const ObjCInterfaceDecl *D, - const ObjCImplementationDecl *Impl) const; + const ASTRecordLayout &getObjCLayout(const ObjCInterfaceDecl *D) const; /// A set of deallocations that should be performed when the /// ASTContext is destroyed. diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index b1b9d56ccca9f..7c70534388b4c 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -948,9 +948,11 @@ void ASTContext::cleanup() { // ASTRecordLayout objects in ASTRecordLayouts must always be destroyed // because they can contain DenseMaps. 
- for (llvm::DenseMap::iterator - I = ObjCLayouts.begin(), E = ObjCLayouts.end(); I != E; ) + for (llvm::DenseMap::iterator + I = ObjCLayouts.begin(), + E = ObjCLayouts.end(); + I != E;) // Increment in loop to prevent using deallocated memory. if (auto *R = const_cast((I++)->second)) R->Destroy(*this); @@ -3092,13 +3094,7 @@ TypeSourceInfo *ASTContext::getTrivialTypeSourceInfo(QualType T, const ASTRecordLayout & ASTContext::getASTObjCInterfaceLayout(const ObjCInterfaceDecl *D) const { - return getObjCLayout(D, nullptr); -} - -const ASTRecordLayout & -ASTContext::getASTObjCImplementationLayout( - const ObjCImplementationDecl *D) const { - return getObjCLayout(D->getClassInterface(), D); + return getObjCLayout(D); } static auto getCanonicalTemplateArguments(const ASTContext &C, @@ -8916,8 +8912,7 @@ static void EncodeBitField(const ASTContext *Ctx, std::string& S, uint64_t Offset; if (const auto *IVD = dyn_cast(FD)) { - Offset = Ctx->lookupFieldBitOffset(IVD->getContainingInterface(), nullptr, - IVD); + Offset = Ctx->lookupFieldBitOffset(IVD->getContainingInterface(), IVD); } else { const RecordDecl *RD = FD->getParent(); const ASTRecordLayout &RL = Ctx->getASTRecordLayout(RD); @@ -11848,8 +11843,12 @@ bool ASTContext::mergeExtParameterInfo( return true; } -void ASTContext::ResetObjCLayout(const ObjCContainerDecl *CD) { - ObjCLayouts[CD] = nullptr; +void ASTContext::ResetObjCLayout(const ObjCInterfaceDecl *D) { + if (ObjCLayouts.count(D)) { + ObjCLayouts[D] = nullptr; + for (auto *SubClass : ObjCSubClasses[D]) + ResetObjCLayout(SubClass); + } } /// mergeObjCGCQualifiers - This routine merges ObjC's GC attribute of 'LHS' and diff --git a/clang/lib/AST/RecordLayoutBuilder.cpp b/clang/lib/AST/RecordLayoutBuilder.cpp index ae6d299024c6d..3e38ba0a43d98 100644 --- a/clang/lib/AST/RecordLayoutBuilder.cpp +++ b/clang/lib/AST/RecordLayoutBuilder.cpp @@ -3481,22 +3481,10 @@ uint64_t ASTContext::getFieldOffset(const ValueDecl *VD) const { } uint64_t ASTContext::lookupFieldBitOffset(const ObjCInterfaceDecl *OID, - const ObjCImplementationDecl *ID, const ObjCIvarDecl *Ivar) const { Ivar = Ivar->getCanonicalDecl(); const ObjCInterfaceDecl *Container = Ivar->getContainingInterface(); - - // FIXME: We should eliminate the need to have ObjCImplementationDecl passed - // in here; it should never be necessary because that should be the lexical - // decl context for the ivar. - - // If we know have an implementation (and the ivar is in it) then - // look up in the implementation layout. - const ASTRecordLayout *RL; - if (ID && declaresSameEntity(ID->getClassInterface(), Container)) - RL = &getASTObjCImplementationLayout(ID); - else - RL = &getASTObjCInterfaceLayout(Container); + const ASTRecordLayout *RL = &getASTObjCInterfaceLayout(Container); // Compute field index. // @@ -3522,8 +3510,7 @@ uint64_t ASTContext::lookupFieldBitOffset(const ObjCInterfaceDecl *OID, /// \param Impl - If given, also include the layout of the interface's /// implementation. This may differ by including synthesized ivars. const ASTRecordLayout & -ASTContext::getObjCLayout(const ObjCInterfaceDecl *D, - const ObjCImplementationDecl *Impl) const { +ASTContext::getObjCLayout(const ObjCInterfaceDecl *D) const { // Retrieve the definition if (D->hasExternalLexicalStorage() && !D->getDefinition()) getExternalSource()->CompleteType(const_cast(D)); @@ -3532,22 +3519,9 @@ ASTContext::getObjCLayout(const ObjCInterfaceDecl *D, "Invalid interface decl!"); // Look up this layout, if already laid out, return what we have. 
- const ObjCContainerDecl *Key = - Impl ? (const ObjCContainerDecl*) Impl : (const ObjCContainerDecl*) D; - if (const ASTRecordLayout *Entry = ObjCLayouts[Key]) + if (const ASTRecordLayout *Entry = ObjCLayouts[D]) return *Entry; - // Add in synthesized ivar count if laying out an implementation. - if (Impl) { - unsigned SynthCount = CountNonClassIvars(D); - // If there aren't any synthesized ivars then reuse the interface - // entry. Note we can't cache this because we simply free all - // entries later; however we shouldn't look up implementations - // frequently. - if (SynthCount == 0) - return getObjCLayout(D, nullptr); - } - ItaniumRecordLayoutBuilder Builder(*this, /*EmptySubobjects=*/nullptr); Builder.Layout(D); @@ -3557,7 +3531,7 @@ ASTContext::getObjCLayout(const ObjCInterfaceDecl *D, /*RequiredAlignment : used by MS-ABI)*/ Builder.Alignment, Builder.getDataSize(), Builder.FieldOffsets); - ObjCLayouts[Key] = NewEntry; + ObjCLayouts[D] = NewEntry; return *NewEntry; } diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp index ebd88bb38849e..d1876f47c0eea 100644 --- a/clang/lib/CodeGen/CGObjCGNU.cpp +++ b/clang/lib/CodeGen/CGObjCGNU.cpp @@ -1826,9 +1826,11 @@ class CGObjCGNUstep2 : public CGObjCGNUstep { Context.getASTObjCInterfaceLayout(SuperClassDecl).getSize().getQuantity(); // Instance size is negative for classes that have not yet had their ivar // layout calculated. - classFields.addInt(LongTy, - 0 - (Context.getASTObjCImplementationLayout(OID).getSize().getQuantity() - - superInstanceSize)); + classFields.addInt( + LongTy, 0 - (Context.getASTObjCInterfaceLayout(OID->getClassInterface()) + .getSize() + .getQuantity() - + superInstanceSize)); if (classDecl->all_declared_ivar_begin() == nullptr) classFields.addNullPointer(PtrTy); @@ -3639,8 +3641,9 @@ void CGObjCGNU::GenerateClass(const ObjCImplementationDecl *OID) { } // Get the size of instances. - int instanceSize = - Context.getASTObjCImplementationLayout(OID).getSize().getQuantity(); + int instanceSize = Context.getASTObjCInterfaceLayout(OID->getClassInterface()) + .getSize() + .getQuantity(); // Collect information about instance variables. SmallVector IvarNames; diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp index 16986de96bdbc..01552b6e53d00 100644 --- a/clang/lib/CodeGen/CGObjCMac.cpp +++ b/clang/lib/CodeGen/CGObjCMac.cpp @@ -3439,8 +3439,9 @@ void CGObjCMac::GenerateClass(const ObjCImplementationDecl *ID) { else if ((hasMRCWeak = hasMRCWeakIvars(CGM, ID))) Flags |= FragileABI_Class_HasMRCWeakIvars; - CharUnits Size = - CGM.getContext().getASTObjCImplementationLayout(ID).getSize(); + CharUnits Size = CGM.getContext() + .getASTObjCInterfaceLayout(ID->getClassInterface()) + .getSize(); // FIXME: Set CXX-structors flag. if (ID->getClassInterface()->getVisibility() == HiddenVisibility) @@ -6330,7 +6331,7 @@ void CGObjCNonFragileABIMac::GetClassSizeInfo(const ObjCImplementationDecl *OID, uint32_t &InstanceStart, uint32_t &InstanceSize) { const ASTRecordLayout &RL = - CGM.getContext().getASTObjCImplementationLayout(OID); + CGM.getContext().getASTObjCInterfaceLayout(OID->getClassInterface()); // InstanceSize is really instance end. 
InstanceSize = RL.getDataSize().getQuantity(); diff --git a/clang/lib/CodeGen/CGObjCRuntime.cpp b/clang/lib/CodeGen/CGObjCRuntime.cpp index a7f5c913f42fc..dfb0fd14d93ac 100644 --- a/clang/lib/CodeGen/CGObjCRuntime.cpp +++ b/clang/lib/CodeGen/CGObjCRuntime.cpp @@ -31,15 +31,14 @@ using namespace CodeGen; uint64_t CGObjCRuntime::ComputeIvarBaseOffset(CodeGen::CodeGenModule &CGM, const ObjCInterfaceDecl *OID, const ObjCIvarDecl *Ivar) { - return CGM.getContext().lookupFieldBitOffset(OID, nullptr, Ivar) / + return CGM.getContext().lookupFieldBitOffset(OID, Ivar) / CGM.getContext().getCharWidth(); } uint64_t CGObjCRuntime::ComputeIvarBaseOffset(CodeGen::CodeGenModule &CGM, const ObjCImplementationDecl *OID, const ObjCIvarDecl *Ivar) { - return CGM.getContext().lookupFieldBitOffset(OID->getClassInterface(), OID, - Ivar) / + return CGM.getContext().lookupFieldBitOffset(OID->getClassInterface(), Ivar) / CGM.getContext().getCharWidth(); } @@ -47,8 +46,7 @@ unsigned CGObjCRuntime::ComputeBitfieldBitOffset( CodeGen::CodeGenModule &CGM, const ObjCInterfaceDecl *ID, const ObjCIvarDecl *Ivar) { - return CGM.getContext().lookupFieldBitOffset(ID, ID->getImplementation(), - Ivar); + return CGM.getContext().lookupFieldBitOffset(ID, Ivar); } LValue CGObjCRuntime::EmitValueForIvarAtOffset(CodeGen::CodeGenFunction &CGF, @@ -86,7 +84,7 @@ LValue CGObjCRuntime::EmitValueForIvarAtOffset(CodeGen::CodeGenFunction &CGF, // non-synthesized ivars but we may be called for synthesized ivars. However, // a synthesized ivar can never be a bit-field, so this is safe. uint64_t FieldBitOffset = - CGF.CGM.getContext().lookupFieldBitOffset(OID, nullptr, Ivar); + CGF.CGM.getContext().lookupFieldBitOffset(OID, Ivar); uint64_t BitOffset = FieldBitOffset % CGF.CGM.getContext().getCharWidth(); uint64_t AlignmentBits = CGF.CGM.getTarget().getCharAlign(); uint64_t BitFieldSize = Ivar->getBitWidthValue(); diff --git a/clang/lib/Sema/SemaDeclObjC.cpp b/clang/lib/Sema/SemaDeclObjC.cpp index e665d0293dc84..ba9d3dcf19617 100644 --- a/clang/lib/Sema/SemaDeclObjC.cpp +++ b/clang/lib/Sema/SemaDeclObjC.cpp @@ -659,6 +659,7 @@ void SemaObjC::ActOnSuperClassOfClassInterface( IDecl->setSuperClass(SuperClassTInfo); IDecl->setEndOfDefinitionLoc(SuperClassTInfo->getTypeLoc().getEndLoc()); + getASTContext().addObjCSubClass(IDecl->getSuperClass(), IDecl); } } @@ -2129,6 +2130,12 @@ SemaObjC::ActOnFinishObjCImplementation(Decl *ObjCImpDecl, DeclsInGroup.push_back(ObjCImpDecl); + // Reset the cached layout if there are any ivars added to + // the implementation. 
+ if (auto *ImplD = dyn_cast(ObjCImpDecl)) + if (!ImplD->ivar_empty()) + getASTContext().ResetObjCLayout(ImplD->getClassInterface()); + return SemaRef.BuildDeclaratorGroup(DeclsInGroup); } diff --git a/clang/test/CodeGenObjC/constant-non-fragile-ivar-offset.m b/clang/test/CodeGenObjC/constant-non-fragile-ivar-offset.m index 8d55e6c7d2308..bc076b4656c9d 100644 --- a/clang/test/CodeGenObjC/constant-non-fragile-ivar-offset.m +++ b/clang/test/CodeGenObjC/constant-non-fragile-ivar-offset.m @@ -9,6 +9,9 @@ // CHECK: @"OBJC_IVAR_$_SubClass.subClassIvar" = constant i64 56 // CHECK: @"OBJC_IVAR_$_SubClass._subClassProperty" = hidden constant i64 64 // CHECK: @"OBJC_IVAR_$_NotStaticLayout.not_static_layout_ivar" = hidden global i64 12 +// CHECK: @"OBJC_IVAR_$_SuperClass2._superClassProperty2" = hidden constant i64 20 +// CHECK: @"OBJC_IVAR_$_IntermediateClass2._IntermediateClass2Property" = hidden constant i64 24 +// CHECK: @"OBJC_IVAR_$_SubClass2._subClass2Property" = hidden constant i64 28 @interface NSObject { int these, will, never, change, ever; @@ -138,3 +141,47 @@ -(void)meth { // CHECK: load i64, ptr @"OBJC_IVAR_$_NotStaticLayout.not_static_layout_ivar } @end + +// CHECK: define internal i32 @"\01-[IntermediateClass2 IntermediateClass2Property]"(ptr noundef %[[SELF:.*]], +// CHECK: %[[SELF_ADDR:.*]] = alloca ptr, align 8 +// CHECK: store ptr %[[SELF]], ptr %[[SELF_ADDR]], align 8 +// CHECK: %[[V0:.*]] = load ptr, ptr %[[SELF_ADDR]], align 8 +// CHECK: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[V0]], i64 24 +// CHECK: %[[LOAD:.*]] = load atomic i32, ptr %[[ADD_PTR]] unordered, align 4 +// CHECK: ret i32 %[[LOAD]] + +// CHECK: define internal i32 @"\01-[SubClass2 subClass2Property]"(ptr noundef %[[SELF:.*]], +// CHECK: %[[SELF_ADDR:.*]] = alloca ptr, align 8 +// CHECK: store ptr %[[SELF]], ptr %[[SELF_ADDR]], align 8 +// CHECK: %[[V0:.*]] = load ptr, ptr %[[SELF_ADDR]], align 8 +// CHECK: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[V0]], i64 28 +// CHECK: %[[LOAD:.*]] = load atomic i32, ptr %[[ADD_PTR]] unordered, align 4 +// CHECK: ret i32 %[[LOAD]] + +@interface SuperClass2 : NSObject +@property int superClassProperty2; +@end + +@interface IntermediateClass2 : SuperClass2 +@property int IntermediateClass2Property; +@end + +@interface IntermediateClass3 : SuperClass2 +@property int IntermediateClass3Property; +@end + +@interface SubClass2 : IntermediateClass2 +@property int subClass2Property; +@end + +@implementation IntermediateClass3 +@end + +@implementation SuperClass2 +@end + +@implementation IntermediateClass2 +@end + +@implementation SubClass2 +@end diff --git a/clang/test/CodeGenObjC/ivar-layout-64.m b/clang/test/CodeGenObjC/ivar-layout-64.m index d3ffdfe444c8b..409434ca3bef3 100644 --- a/clang/test/CodeGenObjC/ivar-layout-64.m +++ b/clang/test/CodeGenObjC/ivar-layout-64.m @@ -63,8 +63,8 @@ @interface D : A @end // CHECK: @OBJC_CLASS_NAME_{{.*}} = private unnamed_addr constant {{.*}} c"D\00" -// CHECK: @OBJC_CLASS_NAME_{{.*}} = private unnamed_addr constant {{.*}} c"\11p\00" -// CHECK: @OBJC_CLASS_NAME_{{.*}} = private unnamed_addr constant {{.*}} c"!`\00" +// CHECK: @OBJC_CLASS_NAME_{{.*}} = private unnamed_addr constant {{.*}} c"\11\A0\00" +// CHECK: @OBJC_CLASS_NAME_{{.*}} = private unnamed_addr constant {{.*}} c"!\90\00" @implementation D @synthesize p3 = _p3; From 3fa85c7cce3047ae0fc54874c2bf8340e8c4803c Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 17 Feb 2025 14:51:47 -0500 Subject: [PATCH 048/127] [libc++] Document that libc++ does not and will never 
implement the Networking TS (#127508) There has been discussion around this a few times already, and there seemed to be consensus that we would never pursue an implementation of the Networking TS. This patch solidifies that discussion by documenting it and closing issues related to the Networking TS. Closes #103799 Closes #100223 Closes #100228 Closes #100231 Closes #100232 --- libcxx/docs/DesignDocs/ExperimentalFeatures.rst | 4 ++-- libcxx/docs/Status/Cxx20Issues.csv | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/libcxx/docs/DesignDocs/ExperimentalFeatures.rst b/libcxx/docs/DesignDocs/ExperimentalFeatures.rst index dc2ae6a25aa5d..0dbbd5f869e36 100644 --- a/libcxx/docs/DesignDocs/ExperimentalFeatures.rst +++ b/libcxx/docs/DesignDocs/ExperimentalFeatures.rst @@ -160,8 +160,8 @@ has been removed in LLVM 17.0. `Networking TS `__ ------------------------------------------- -The Networking TS is not yet part of a shipping standard. -We have not yet shipped an implementation of the Networking TS. +The Networking TS is not yet part of a shipping standard, and there is discussion around removing it. +Libc++ never shipped an implementation of the Networking TS and does not plan to do so in the future. `Ranges TS `__ --------------------------------------- diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index 3462557e8d668..1b8e76d90d9ef 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -13,7 +13,7 @@ "`LWG2966 `__","Incomplete resolution of US 74","2017-07 (Toronto)","|Nothing To Do|","","" "`LWG2974 `__","Diagnose out of bounds ``tuple_element/variant_alternative``\ ","2017-07 (Toronto)","|Complete|","","" "","","","","","" -"`LWG2779 `__","[networking.ts] Relax requirements on buffer sequence iterators","2017-11 (Albuquerque)","","","" +"`LWG2779 `__","[networking.ts] Relax requirements on buffer sequence iterators","2017-11 (Albuquerque)","|Nothing To Do|","","" "`LWG2870 `__","Default value of parameter theta of polar should be dependent","2017-11 (Albuquerque)","|Complete|","","" "`LWG2935 `__","What should create_directories do when p already exists but is not a directory?","2017-11 (Albuquerque)","|Nothing To Do|","","" "`LWG2941 `__","[thread.req.timing] wording should apply to both member and namespace-level functions","2017-11 (Albuquerque)","|Nothing To Do|","","" @@ -51,17 +51,17 @@ "`LWG2975 `__","Missing case for ``pair``\ construction in scoped and polymorphic allocators","2018-03 (Jacksonville)","","","" "`LWG2989 `__","``path``\ 's stream insertion operator lets you insert everything under the sun","2018-03 (Jacksonville)","|Complete|","","" "`LWG3000 `__","``monotonic_memory_resource::do_is_equal``\ uses ``dynamic_cast``\ unnecessarily","2018-03 (Jacksonville)","|Complete|","16","" -"`LWG3002 `__","[networking.ts] ``basic_socket_acceptor::is_open()``\ isn't ``noexcept``\ ","2018-03 (Jacksonville)","","","" +"`LWG3002 `__","[networking.ts] ``basic_socket_acceptor::is_open()``\ isn't ``noexcept``\ ","2018-03 (Jacksonville)","|Nothing To Do|","","" "`LWG3004 `__","|sect|\ [string.capacity] and |sect|\ [vector.capacity] should specify time complexity for ``capacity()``\ ","2018-03 (Jacksonville)","|Nothing To Do|","","" "`LWG3005 `__","Destruction order of arrays by ``make_shared/allocate_shared``\ only recommended?","2018-03 (Jacksonville)","","","" "`LWG3007 `__","``allocate_shared``\ should rebind allocator to *cv*-unqualified ``value_type``\ for construction","2018-03 
(Jacksonville)","","","" "`LWG3009 `__","Including ````\ doesn't provide ``std::size/empty/data``\ ","2018-03 (Jacksonville)","|Complete|","","" -"`LWG3010 `__","[networking.ts] ``uses_executor``\ says ""if a type ``T::executor_type``\ exists""","2018-03 (Jacksonville)","","","" +"`LWG3010 `__","[networking.ts] ``uses_executor``\ says ""if a type ``T::executor_type``\ exists""","2018-03 (Jacksonville)","|Nothing To Do|","","" "`LWG3013 `__","``(recursive_)directory_iterator``\ construction and traversal should not be ``noexcept``\ ","2018-03 (Jacksonville)","|Complete|","","" "`LWG3014 `__","More ``noexcept``\ issues with filesystem operations","2018-03 (Jacksonville)","|Complete|","","" "`LWG3015 `__","``copy_options::*unspecified*``\ underspecified","2018-03 (Jacksonville)","|Nothing To Do|","","" "`LWG3017 `__","``list splice``\ functions should use ``addressof``\ ","2018-03 (Jacksonville)","|Complete|","","" -"`LWG3020 `__","[networking.ts] Remove spurious nested ``value_type``\ buffer sequence requirement","2018-03 (Jacksonville)","","","" +"`LWG3020 `__","[networking.ts] Remove spurious nested ``value_type``\ buffer sequence requirement","2018-03 (Jacksonville)","|Nothing To Do|","","" "`LWG3026 `__","``filesystem::weakly_canonical``\ still defined in terms of ``canonical(p, base)``\ ","2018-03 (Jacksonville)","|Complete|","","" "`LWG3030 `__","Who shall meet the requirements of ``try_lock``\ ?","2018-03 (Jacksonville)","|Nothing To Do|","","" "`LWG3034 `__","P0767R1 breaks previously-standard-layout types","2018-03 (Jacksonville)","|Complete|","","" From a7a356833df81b605ecaa3b0a7391da68805b680 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Mon, 17 Feb 2025 22:05:22 +0200 Subject: [PATCH 049/127] [NFC][Clang][CodeGen] Remove vestigial assertion (#127528) This removes a vestigial assertion, which would erroneously trigger even though we now correctly handle valid arg mismatches (), after #114062 went in. --- clang/lib/CodeGen/CGCall.cpp | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index e6c2ac939eb88..47bfd470dbafb 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -5633,22 +5633,6 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, if (!CallArgs.getCleanupsToDeactivate().empty()) deactivateArgCleanupsBeforeCall(*this, CallArgs); - // Assert that the arguments we computed match up. The IR verifier - // will catch this, but this is a common enough source of problems - // during IRGen changes that it's way better for debugging to catch - // it ourselves here. -#ifndef NDEBUG - assert(IRCallArgs.size() == IRFuncTy->getNumParams() || IRFuncTy->isVarArg()); - for (unsigned i = 0; i < IRCallArgs.size(); ++i) { - // Inalloca argument can have different type. - if (IRFunctionArgs.hasInallocaArg() && - i == IRFunctionArgs.getInallocaArgNo()) - continue; - if (i < IRFuncTy->getNumParams()) - assert(IRCallArgs[i]->getType() == IRFuncTy->getParamType(i)); - } -#endif - // Update the largest vector width if any arguments have vector types. for (unsigned i = 0; i < IRCallArgs.size(); ++i) LargestVectorWidth = std::max(LargestVectorWidth, From 9a584b07d7c29cec65bb446782c4ddddaf72e6d8 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Mon, 17 Feb 2025 14:06:24 -0600 Subject: [PATCH 050/127] [Clang] Add handlers for 'match_any' and 'match_all' to `gpuintrin.h` (#127504) Summary: These helpers are very useful but currently absent. 
They allow the user to get a bitmask representing the matches within the warp. I have made an executive decision to drop the `predicate` return from `match_all` because it's easily testable with `match_all() == __activemask()`. --- clang/lib/Headers/amdgpuintrin.h | 56 ++++++++++++++ clang/lib/Headers/nvptxintrin.h | 74 +++++++++++++++++++ libc/src/__support/GPU/utils.h | 8 ++ .../src/__support/GPU/CMakeLists.txt | 9 +++ .../integration/src/__support/GPU/match.cpp | 35 +++++++++ 5 files changed, 182 insertions(+) create mode 100644 libc/test/integration/src/__support/GPU/match.cpp diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h index 9dad99ffe9439..355e75d0b2d42 100644 --- a/clang/lib/Headers/amdgpuintrin.h +++ b/clang/lib/Headers/amdgpuintrin.h @@ -162,6 +162,62 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width)); } +// Returns a bitmask marking all lanes that have the same value of __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) { + uint32_t __match_mask = 0; + + bool __done = 0; + while (__gpu_ballot(__lane_mask, !__done)) { + if (!__done) { + uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x); + if (__first == __x) { + __match_mask = __gpu_lane_mask(); + __done = 1; + } + } + } + __gpu_sync_lane(__lane_mask); + return __match_mask; +} + +// Returns a bitmask marking all lanes that have the same value of __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) { + uint64_t __match_mask = 0; + + bool __done = 0; + while (__gpu_ballot(__lane_mask, __done)) { + if (!__done) { + uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x); + if (__first == __x) { + __match_mask = __gpu_lane_mask(); + __done = 1; + } + } + } + __gpu_sync_lane(__lane_mask); + return __match_mask; +} + +// Returns the current lane mask if every lane contains __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) { + uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x); + uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first); + __gpu_sync_lane(__lane_mask); + return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull; +} + +// Returns the current lane mask if every lane contains __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) { + uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x); + uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first); + __gpu_sync_lane(__lane_mask); + return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull; +} + // Returns true if the flat pointer points to AMDGPU 'shared' memory. 
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) { return __builtin_amdgcn_is_shared((void [[clang::address_space(0)]] *)(( diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h index 40fa2edebe975..f857a87b5f4c7 100644 --- a/clang/lib/Headers/nvptxintrin.h +++ b/clang/lib/Headers/nvptxintrin.h @@ -13,6 +13,10 @@ #error "This file is intended for NVPTX targets or offloading to NVPTX" #endif +#ifndef __CUDA_ARCH__ +#define __CUDA_ARCH__ 0 +#endif + #include #if !defined(__cplusplus) @@ -168,6 +172,76 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width)); } +// Returns a bitmask marking all lanes that have the same value of __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) { + // Newer targets can use the dedicated CUDA support. + if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700) + return __nvvm_match_any_sync_i32(__lane_mask, __x); + + uint32_t __match_mask = 0; + bool __done = 0; + while (__gpu_ballot(__lane_mask, !__done)) { + if (!__done) { + uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x); + if (__first == __x) { + __match_mask = __gpu_lane_mask(); + __done = 1; + } + } + } + return __match_mask; +} + +// Returns a bitmask marking all lanes that have the same value of __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) { + // Newer targets can use the dedicated CUDA support. + if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700) + return __nvvm_match_any_sync_i64(__lane_mask, __x); + + uint64_t __match_mask = 0; + + bool __done = 0; + while (__gpu_ballot(__lane_mask, __done)) { + if (!__done) { + uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x); + if (__first == __x) { + __match_mask = __gpu_lane_mask(); + __done = 1; + } + } + } + __gpu_sync_lane(__lane_mask); + return __match_mask; +} + +// Returns the current lane mask if every lane contains __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) { + // Newer targets can use the dedicated CUDA support. + int predicate; + if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700) + return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate); + + uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x); + uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first); + return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull; +} + +// Returns the current lane mask if every lane contains __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) { + // Newer targets can use the dedicated CUDA support. + int predicate; + if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700) + return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate); + + uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x); + uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first); + return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull; +} + // Returns true if the flat pointer points to CUDA 'shared' memory. 
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) { return __nvvm_isspacep_shared(ptr); diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h index 323c003f1ff07..0fd3a6498b865 100644 --- a/libc/src/__support/GPU/utils.h +++ b/libc/src/__support/GPU/utils.h @@ -92,6 +92,14 @@ LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x, return __gpu_shuffle_idx_u32(lane_mask, idx, x, width); } +LIBC_INLINE uint64_t match_any(uint64_t lane_mask, uint32_t x) { + return __gpu_match_any_u32(lane_mask, x); +} + +LIBC_INLINE uint64_t match_all(uint64_t lane_mask, uint32_t x) { + return __gpu_match_all_u32(lane_mask, x); +} + [[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); } LIBC_INLINE bool is_first_lane(uint64_t lane_mask) { diff --git a/libc/test/integration/src/__support/GPU/CMakeLists.txt b/libc/test/integration/src/__support/GPU/CMakeLists.txt index 68bbc3849bc7e..e066830f6cc0d 100644 --- a/libc/test/integration/src/__support/GPU/CMakeLists.txt +++ b/libc/test/integration/src/__support/GPU/CMakeLists.txt @@ -18,3 +18,12 @@ add_integration_test( LOADER_ARGS --threads 64 ) + +add_integration_test( + match_test + SUITE libc-support-gpu-tests + SRCS + match.cpp + LOADER_ARGS + --threads 64 +) diff --git a/libc/test/integration/src/__support/GPU/match.cpp b/libc/test/integration/src/__support/GPU/match.cpp new file mode 100644 index 0000000000000..0eadb1364eec7 --- /dev/null +++ b/libc/test/integration/src/__support/GPU/match.cpp @@ -0,0 +1,35 @@ +//===-- Test for the shuffle operations on the GPU ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/bit.h" +#include "src/__support/GPU/utils.h" +#include "test/IntegrationTest/test.h" + +using namespace LIBC_NAMESPACE; + +// Test to ensure that match any / match all work. +static void test_match() { + uint64_t mask = gpu::get_lane_mask(); + EXPECT_EQ(1ull << gpu::get_lane_id(), + gpu::match_any(mask, gpu::get_lane_id())); + EXPECT_EQ(mask, gpu::match_any(mask, 1)); + + uint64_t expected = gpu::get_lane_id() < 16 ? 0xffff : 0xffff0000; + EXPECT_EQ(expected, gpu::match_any(mask, gpu::get_lane_id() < 16)); + EXPECT_EQ(mask, gpu::match_all(mask, 1)); + EXPECT_EQ(0ull, gpu::match_all(mask, gpu::get_lane_id())); +} + +TEST_MAIN(int argc, char **argv, char **envp) { + if (gpu::get_thread_id() >= gpu::get_lane_size()) + return 0; + + test_match(); + + return 0; +} From a8b177aa6048e3dc278f63f4bc79e2c199ecd722 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 17 Feb 2025 21:14:01 +0100 Subject: [PATCH 051/127] [LAA] Remove unneeded hasNoOverflow call (NFC). The function already calls hasNoOverflow above. 
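For reference, a minimal sketch of the function's shape (not the exact LLVM source; parameters and the other checks are elided) showing why the removed call could never change the result:

```cpp
// Sketch only - LLVM headers and unrelated parameters omitted.
static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
                     Value *Ptr /*, ... */) {
  if (PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
    return true;   // the overflow query already succeeds (or fails) here
  // ... further no-wrap checks, each returning true on success ...
  return false;    // previously a second hasNoOverflow query, which is only
}                  // reached after the first one has already returned false
```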
--- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 23bfd9989469a..90db89f745e89 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -886,7 +886,7 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, return true; } - return PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW); + return false; } static void visitPointers(Value *StartPtr, const Loop &InnermostLoop, From a92bfaa7d92180c3c88b2c116689de30a72546c2 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Mon, 17 Feb 2025 21:34:12 +0100 Subject: [PATCH 052/127] [LLD][COFF] Support MinGW constructor and destructor lists on ARM64X (#127205) Split the chunks for EC and native views, inserting headers and tails for both. --- lld/COFF/Chunks.cpp | 8 +++- lld/COFF/Chunks.h | 7 +-- lld/COFF/Writer.cpp | 37 +++++++++------- lld/test/COFF/arm64x-ctors-sec.s | 76 ++++++++++++++++++++++++++++++++ 4 files changed, 108 insertions(+), 20 deletions(-) create mode 100644 lld/test/COFF/arm64x-ctors-sec.s diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index a01c69c709876..3494d1ba0ac02 100644 --- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -1070,16 +1070,20 @@ void MergeChunk::writeTo(uint8_t *buf) const { } // MinGW specific. -size_t AbsolutePointerChunk::getSize() const { return ctx.config.wordsize; } +size_t AbsolutePointerChunk::getSize() const { + return symtab.ctx.config.wordsize; +} void AbsolutePointerChunk::writeTo(uint8_t *buf) const { - if (ctx.config.is64()) { + if (symtab.ctx.config.is64()) { write64le(buf, value); } else { write32le(buf, value); } } +MachineTypes AbsolutePointerChunk::getMachine() const { return symtab.machine; } + void ECExportThunkChunk::writeTo(uint8_t *buf) const { memcpy(buf, ECExportThunkCode, sizeof(ECExportThunkCode)); write32le(buf + 10, target->getRVA() - rva - 14); diff --git a/lld/COFF/Chunks.h b/lld/COFF/Chunks.h index d6216efdd90bd..06e9aae0e6f6e 100644 --- a/lld/COFF/Chunks.h +++ b/lld/COFF/Chunks.h @@ -910,16 +910,17 @@ class PseudoRelocTableChunk : public NonSectionChunk { // MinGW specific. A Chunk that contains one pointer-sized absolute value. class AbsolutePointerChunk : public NonSectionChunk { public: - AbsolutePointerChunk(COFFLinkerContext &ctx, uint64_t value) - : value(value), ctx(ctx) { + AbsolutePointerChunk(SymbolTable &symtab, uint64_t value) + : value(value), symtab(symtab) { setAlignment(getSize()); } size_t getSize() const override; void writeTo(uint8_t *buf) const override; + MachineTypes getMachine() const override; private: uint64_t value; - COFFLinkerContext &ctx; + SymbolTable &symtab; }; // Return true if this file has the hotpatch flag set to true in the S_COMPILE3 diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 504558087c80d..58727c1615769 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -2335,21 +2335,28 @@ void Writer::createRuntimePseudoRelocs() { // There's a symbol pointing to the start sentinel pointer, __CTOR_LIST__ // and __DTOR_LIST__ respectively. 
void Writer::insertCtorDtorSymbols() { - AbsolutePointerChunk *ctorListHead = make(ctx, -1); - AbsolutePointerChunk *ctorListEnd = make(ctx, 0); - AbsolutePointerChunk *dtorListHead = make(ctx, -1); - AbsolutePointerChunk *dtorListEnd = make(ctx, 0); - ctorsSec->insertChunkAtStart(ctorListHead); - ctorsSec->addChunk(ctorListEnd); - dtorsSec->insertChunkAtStart(dtorListHead); - dtorsSec->addChunk(dtorListEnd); - - Symbol *ctorListSym = ctx.symtab.findUnderscore("__CTOR_LIST__"); - Symbol *dtorListSym = ctx.symtab.findUnderscore("__DTOR_LIST__"); - replaceSymbol(ctorListSym, ctorListSym->getName(), - ctorListHead); - replaceSymbol(dtorListSym, dtorListSym->getName(), - dtorListHead); + ctx.forEachSymtab([&](SymbolTable &symtab) { + AbsolutePointerChunk *ctorListHead = make(symtab, -1); + AbsolutePointerChunk *ctorListEnd = make(symtab, 0); + AbsolutePointerChunk *dtorListHead = make(symtab, -1); + AbsolutePointerChunk *dtorListEnd = make(symtab, 0); + ctorsSec->insertChunkAtStart(ctorListHead); + ctorsSec->addChunk(ctorListEnd); + dtorsSec->insertChunkAtStart(dtorListHead); + dtorsSec->addChunk(dtorListEnd); + + Symbol *ctorListSym = symtab.findUnderscore("__CTOR_LIST__"); + Symbol *dtorListSym = symtab.findUnderscore("__DTOR_LIST__"); + replaceSymbol(ctorListSym, ctorListSym->getName(), + ctorListHead); + replaceSymbol(dtorListSym, dtorListSym->getName(), + dtorListHead); + }); + + if (ctx.hybridSymtab) { + ctorsSec->splitECChunks(); + dtorsSec->splitECChunks(); + } } // Handles /section options to allow users to overwrite diff --git a/lld/test/COFF/arm64x-ctors-sec.s b/lld/test/COFF/arm64x-ctors-sec.s new file mode 100644 index 0000000000000..283d5f045260d --- /dev/null +++ b/lld/test/COFF/arm64x-ctors-sec.s @@ -0,0 +1,76 @@ +// REQUIRES: aarch64, x86 +// RUN: split-file %s %t.dir && cd %t.dir + +// RUN: llvm-mc -filetype=obj -triple=aarch64-windows ctor1-arm64.s -o ctor1-arm64.obj +// RUN: llvm-mc -filetype=obj -triple=aarch64-windows ctor2-arm64.s -o ctor2-arm64.obj +// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows ctor1-arm64ec.s -o ctor1-arm64ec.obj +// RUN: llvm-mc -filetype=obj -triple=x86_64-windows ctor2-amd64.s -o ctor2-amd64.obj +// RUN: llvm-mc -filetype=obj -triple=aarch64-windows test.s -o test-arm64.obj +// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows test.s -o test-arm64ec.obj + +// Check that .ctors and .dtors chunks are correctly sorted and that EC and native chunks are split. 
+ +// RUN: lld-link -out:out.dll -machine:arm64x -lldmingw -dll -noentry test-arm64.obj test-arm64ec.obj \ +// RUN: ctor1-arm64.obj ctor2-arm64.obj ctor1-arm64ec.obj ctor2-amd64.obj +// RUN: llvm-readobj --hex-dump=.rdata --hex-dump=.test out.dll | FileCheck %s + +// RUN: lld-link -out:out2.dll -machine:arm64x -lldmingw -dll -noentry test-arm64.obj test-arm64ec.obj \ +// RUN: ctor1-arm64ec.obj ctor2-amd64.obj ctor1-arm64.obj ctor2-arm64.obj +// RUN: llvm-readobj --hex-dump=.rdata --hex-dump=.test out2.dll | FileCheck %s + +// RUN: lld-link -out:out3.dll -machine:arm64x -lldmingw -dll -noentry test-arm64.obj test-arm64ec.obj \ +// RUN: ctor2-arm64.obj ctor1-arm64ec.obj ctor2-amd64.obj ctor1-arm64.obj +// RUN: llvm-readobj --hex-dump=.rdata --hex-dump=.test out3.dll | FileCheck %s + +// CHECK: Hex dump of section '.rdata': +// CHECK-NEXT: 0x180001000 ffffffff ffffffff 01000000 00000000 +// CHECK-NEXT: 0x180001010 02000000 00000000 03000000 00000000 +// CHECK-NEXT: 0x180001020 00000000 00000000 ffffffff ffffffff +// CHECK-NEXT: 0x180001030 11000000 00000000 12000000 00000000 +// CHECK-NEXT: 0x180001040 13000000 00000000 00000000 00000000 +// CHECK-NEXT: 0x180001050 ffffffff ffffffff 01010000 00000000 +// CHECK-NEXT: 0x180001060 02010000 00000000 03010000 00000000 +// CHECK-NEXT: 0x180001070 00000000 00000000 ffffffff ffffffff +// CHECK-NEXT: 0x180001080 11010000 00000000 12010000 00000000 +// CHECK-NEXT: 0x180001090 13010000 00000000 00000000 00000000 +// CHECK-EMPTY: +// CHECK-NEXT: Hex dump of section '.test': +// CHECK-NEXT: 0x180003000 00100000 50100000 28100000 78100000 + +#--- ctor1-arm64.s + .section .ctors.1,"drw" + .xword 1 + .section .ctors.3,"drw" + .xword 3 + .section .dtors.1,"drw" + .xword 0x101 + .section .dtors.3,"drw" + .xword 0x103 + +#--- ctor2-arm64.s + .section .ctors.2,"drw" + .xword 2 + .section .dtors.2,"drw" + .xword 0x102 + +#--- ctor1-arm64ec.s + .section .ctors.1,"drw" + .xword 0x11 + .section .ctors.3,"drw" + .xword 0x13 + .section .dtors.1,"drw" + .xword 0x111 + .section .dtors.3,"drw" + .xword 0x113 + +#--- ctor2-amd64.s + .section .ctors.2,"drw" + .quad 0x12 + .section .dtors.2,"drw" + .quad 0x112 + +#--- test.s + .section .test + .rva __CTOR_LIST__ + .rva __DTOR_LIST__ + From 620a51535ba7934a6f6bca5b74ff75946e886d87 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 17 Feb 2025 21:36:53 +0100 Subject: [PATCH 053/127] [VPlan] Add message to assert in HCFGBuilder (NFC). Suggested in https://github.com/llvm/llvm-project/pull/126388. --- llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index 33a367a0b65c1..70d8575ba82c5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -134,7 +134,8 @@ void PlainCFGBuilder::fixPhiNodes() { if (isHeaderBB(Phi->getParent(), L)) { // For header phis, make sure the incoming value from the loop // predecessor is the first operand of the recipe. - assert(Phi->getNumOperands() == 2); + assert(Phi->getNumOperands() == 2 && + "header phi must have exactly 2 operands"); BasicBlock *LoopPred = L->getLoopPredecessor(); VPPhi->addOperand( getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopPred))); From 88e72c401b5bbedb3461039935e940eccff53f02 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 17 Feb 2025 21:49:39 +0100 Subject: [PATCH 054/127] [LAA] Add test where GEPs may wrap. 
--- .../runtime-checks-may-wrap.ll | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/runtime-checks-may-wrap.ll diff --git a/llvm/test/Analysis/LoopAccessAnalysis/runtime-checks-may-wrap.ll b/llvm/test/Analysis/LoopAccessAnalysis/runtime-checks-may-wrap.ll new file mode 100644 index 0000000000000..b27937862b261 --- /dev/null +++ b/llvm/test/Analysis/LoopAccessAnalysis/runtime-checks-may-wrap.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='print' -disable-output %s 2>&1 | FileCheck %s + +target datalayout = "p:16:16" + +define void @geps_may_wrap(ptr %a, ptr %b, i64 %N) { +; CHECK-LABEL: 'geps_may_wrap' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ([[GRP1:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.iv = getelementptr i32, ptr %a, i64 %iv +; CHECK-NEXT: Against group ([[GRP2:0x[0-9a-f]+]]): +; CHECK-NEXT: ptr %b +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group [[GRP1]]: +; CHECK-NEXT: (Low: %a High: (16 + (12 * (trunc i128 ((zext i64 %N to i128) /u 3) to i16)) + %a)) +; CHECK-NEXT: Member: {%a,+,12}<%loop> +; CHECK-NEXT: Group [[GRP2]]: +; CHECK-NEXT: (Low: %b High: (4 + %b)) +; CHECK-NEXT: Member: %b +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {0,+,3}<%loop> Added Flags: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.iv = getelementptr i32, ptr %a, i64 %iv + store i32 0, ptr %gep.iv, align 1 + store i32 0, ptr %b, align 1 + %iv.next = add i64 %iv, 3 + %.not = icmp ult i64 %N, %iv + br i1 %.not, label %exit, label %loop + +exit: + ret void +} From 93a1184409feb6b0ebb1001e1bebcecf760a6673 Mon Sep 17 00:00:00 2001 From: Henrich Lauko Date: Mon, 17 Feb 2025 22:44:10 +0100 Subject: [PATCH 055/127] [MLIR][LLVM] Fix import split marker in alias test (#127540) --- mlir/test/Target/LLVMIR/Import/alias.ll | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mlir/test/Target/LLVMIR/Import/alias.ll b/mlir/test/Target/LLVMIR/Import/alias.ll index 9f86da3ecc71c..3ab68a7d8fb81 100644 --- a/mlir/test/Target/LLVMIR/Import/alias.ll +++ b/mlir/test/Target/LLVMIR/Import/alias.ll @@ -12,7 +12,7 @@ entry: ret ptr null } -; ----- +; // ----- @zed = global i32 42 @foo = alias i32, ptr @zed @@ -27,7 +27,7 @@ entry: ; CHECK: llvm.return %[[ADDR]] : !llvm.ptr ; CHECK: } -; ----- +; // ----- @v1 = global i32 0 @a3 = alias i32, addrspacecast (ptr @v1 to ptr addrspace(2)) @@ -37,7 +37,7 @@ entry: ; CHECK: llvm.return %[[CASTED_ADDR]] : !llvm.ptr<2> ; CHECK: } -; ----- +; // ----- @some_name = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr null] } @vtable = alias { [3 x ptr] }, ptr @some_name @@ -47,7 +47,7 @@ entry: ; CHECK: llvm.return %[[ADDR]] : !llvm.ptr ; CHECK: } -; ----- +; // ----- @glob.private = private constant [32 x i32] zeroinitializer @glob = linkonce_odr hidden alias [32 x i32], inttoptr (i64 add (i64 ptrtoint (ptr @glob.private to i64), i64 1234) to ptr) @@ -60,7 +60,7 @@ entry: ; CHECK: %[[RET:.*]] = llvm.inttoptr %[[INTTOPTR]] : i64 to !llvm.ptr ; CHECK: llvm.return %[[RET]] : !llvm.ptr -; ----- +; // ----- @g1 = private global i32 0 @g2 = 
internal constant ptr @a1 From a377cdd23db180b881f54fc7f88bf3aa85de21cc Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Mon, 17 Feb 2025 22:03:53 +0000 Subject: [PATCH 056/127] [lldb][TypeSystemClang] Add support for floating point template argument constants (#127206) This patch adds support for template arguments of `clang::TemplateArgument::ArgKind::StructuralValue` kind (added in https://github.com/llvm/llvm-project/pull/78041). These are used for non-type template parameters such as floating point constants. When LLDB created `clang::NonTypeTemplateParmDecl`s, it previously assumed integral values, this patch accounts for structural values too. Anywhere LLDB assumed a `DW_TAG_template_value_parameter` was `Integral`, it will now also check for `StructuralValue`, and will unpack the `TemplateArgument` value and type accordingly. We can rely on the fact that any `TemplateArgument` of `StructuralValue` kind that the `DWARFASTParserClang` creates will have a valid value, because it gets those from `DW_AT_const_value`. --- lldb/include/lldb/Symbol/CompilerType.h | 3 +- lldb/source/API/SBType.cpp | 4 +- .../Language/CPlusPlus/GenericBitset.cpp | 2 +- .../Plugins/Language/CPlusPlus/LibCxxSpan.cpp | 2 +- .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 57 +++++++++++++------ .../TypeSystem/Clang/TypeSystemClang.cpp | 56 +++++++++++++----- .../TestCppTemplateArguments.py | 40 ++++++++++++- .../API/lang/cpp/template-arguments/main.cpp | 8 +++ lldb/unittests/Symbol/TestTypeSystemClang.cpp | 42 +++++++++++++- 9 files changed, 174 insertions(+), 40 deletions(-) diff --git a/lldb/include/lldb/Symbol/CompilerType.h b/lldb/include/lldb/Symbol/CompilerType.h index 096a8f1ab68e8..fe4fcbccee370 100644 --- a/lldb/include/lldb/Symbol/CompilerType.h +++ b/lldb/include/lldb/Symbol/CompilerType.h @@ -14,6 +14,7 @@ #include #include +#include "lldb/Utility/Scalar.h" #include "lldb/lldb-private.h" #include "llvm/ADT/APSInt.h" #include "llvm/Support/Casting.h" @@ -544,7 +545,7 @@ bool operator==(const CompilerType &lhs, const CompilerType &rhs); bool operator!=(const CompilerType &lhs, const CompilerType &rhs); struct CompilerType::IntegralTemplateArgument { - llvm::APSInt value; + Scalar value; CompilerType type; }; diff --git a/lldb/source/API/SBType.cpp b/lldb/source/API/SBType.cpp index 6401d32c85795..9eb1f0c75ea05 100644 --- a/lldb/source/API/SBType.cpp +++ b/lldb/source/API/SBType.cpp @@ -697,6 +697,7 @@ lldb::SBValue SBType::GetTemplateArgumentValue(lldb::SBTarget target, std::optional arg; const bool expand_pack = true; switch (GetTemplateArgumentKind(idx)) { + case eTemplateArgumentKindStructuralValue: case eTemplateArgumentKindIntegral: arg = m_opaque_sp->GetCompilerType(false).GetIntegralTemplateArgument( idx, expand_pack); @@ -708,9 +709,8 @@ lldb::SBValue SBType::GetTemplateArgumentValue(lldb::SBTarget target, if (!arg) return {}; - Scalar value{arg->value}; DataExtractor data; - value.GetData(data); + arg->value.GetData(data); ExecutionContext exe_ctx; auto target_sp = target.GetSP(); diff --git a/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp b/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp index f83f81fbdd1e7..934b456884ac0 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp @@ -90,7 +90,7 @@ lldb::ChildCacheState GenericBitsetFrontEnd::Update() { size_t size = 0; if (auto arg = m_backend.GetCompilerType().GetIntegralTemplateArgument(0)) - size = arg->value.getLimitedValue(); + size = 
arg->value.GetAPSInt().getLimitedValue(); m_elements.assign(size, ValueObjectSP()); m_first = diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp index ab3a5cf954ec7..21ee83041c065 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp @@ -117,7 +117,7 @@ lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd::Update() { } else if (auto arg = m_backend.GetCompilerType().GetIntegralTemplateArgument(1)) { - m_num_elements = arg->value.getLimitedValue(); + m_num_elements = arg->value.GetAPSInt().getLimitedValue(); } } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index ec0004c70c6da..2d4d22559963f 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -1973,6 +1973,33 @@ class DWARFASTParserClang::DelayedAddObjCClassProperty { ClangASTMetadata m_metadata; }; +static std::optional MakeAPValue(const clang::ASTContext &ast, + CompilerType clang_type, + uint64_t value) { + std::optional bit_width = clang_type.GetBitSize(nullptr); + if (!bit_width) + return std::nullopt; + + bool is_signed = false; + const bool is_integral = clang_type.IsIntegerOrEnumerationType(is_signed); + + llvm::APSInt apint(*bit_width, !is_signed); + apint = value; + + if (is_integral) + return clang::APValue(apint); + + uint32_t count; + bool is_complex; + // FIXME: we currently support a limited set of floating point types. + // E.g., 16-bit floats are not supported. + if (!clang_type.IsFloatingPointType(count, is_complex)) + return std::nullopt; + + return clang::APValue(llvm::APFloat( + ast.getFloatTypeSemantics(ClangUtil::GetQualType(clang_type)), apint)); +} + bool DWARFASTParserClang::ParseTemplateDIE( const DWARFDIE &die, TypeSystemClang::TemplateParameterInfos &template_param_infos) { @@ -2050,28 +2077,26 @@ bool DWARFASTParserClang::ParseTemplateDIE( clang_type = m_ast.GetBasicType(eBasicTypeVoid); if (!is_template_template_argument) { - bool is_signed = false; - // Get the signed value for any integer or enumeration if available - clang_type.IsIntegerOrEnumerationType(is_signed); if (name && !name[0]) name = nullptr; if (tag == DW_TAG_template_value_parameter && uval64_valid) { - std::optional size = clang_type.GetBitSize(nullptr); - if (!size) - return false; - llvm::APInt apint(*size, uval64, is_signed); - template_param_infos.InsertArg( - name, clang::TemplateArgument(ast, llvm::APSInt(apint, !is_signed), - ClangUtil::GetQualType(clang_type), - is_default_template_arg)); - } else { - template_param_infos.InsertArg( - name, clang::TemplateArgument(ClangUtil::GetQualType(clang_type), - /*isNullPtr*/ false, - is_default_template_arg)); + if (auto value = MakeAPValue(ast, clang_type, uval64)) { + template_param_infos.InsertArg( + name, clang::TemplateArgument( + ast, ClangUtil::GetQualType(clang_type), + std::move(*value), is_default_template_arg)); + return true; + } } + + // We get here if this is a type-template parameter or we couldn't create + // a non-type template parameter. 
+ template_param_infos.InsertArg( + name, clang::TemplateArgument(ClangUtil::GetQualType(clang_type), + /*isNullPtr*/ false, + is_default_template_arg)); } else { auto *tplt_type = m_ast.CreateTemplateTemplateParmDecl(template_name); template_param_infos.InsertArg( diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index bcb63f719de10..1e0c7f0514941 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -1311,10 +1311,18 @@ CompilerType TypeSystemClang::CreateRecordType( } namespace { -/// Returns true iff the given TemplateArgument should be represented as an -/// NonTypeTemplateParmDecl in the AST. -bool IsValueParam(const clang::TemplateArgument &argument) { - return argument.getKind() == TemplateArgument::Integral; +/// Returns the type of the template argument iff the given TemplateArgument +/// should be represented as an NonTypeTemplateParmDecl in the AST. Returns +/// a null QualType otherwise. +QualType GetValueParamType(const clang::TemplateArgument &argument) { + switch (argument.getKind()) { + case TemplateArgument::Integral: + return argument.getIntegralType(); + case TemplateArgument::StructuralValue: + return argument.getStructuralValueType(); + default: + return {}; + } } void AddAccessSpecifierDecl(clang::CXXRecordDecl *cxx_record_decl, @@ -1361,8 +1369,8 @@ static TemplateParameterList *CreateTemplateParameterList( if (name && name[0]) identifier_info = &ast.Idents.get(name); TemplateArgument const &targ = args[i]; - if (IsValueParam(targ)) { - QualType template_param_type = targ.getIntegralType(); + QualType template_param_type = GetValueParamType(targ); + if (!template_param_type.isNull()) { template_param_decls.push_back(NonTypeTemplateParmDecl::Create( ast, decl_context, SourceLocation(), SourceLocation(), depth, i, identifier_info, template_param_type, parameter_pack, @@ -1380,10 +1388,11 @@ static TemplateParameterList *CreateTemplateParameterList( identifier_info = &ast.Idents.get(template_param_infos.GetPackName()); const bool parameter_pack_true = true; - if (!template_param_infos.GetParameterPack().IsEmpty() && - IsValueParam(template_param_infos.GetParameterPack().Front())) { - QualType template_param_type = - template_param_infos.GetParameterPack().Front().getIntegralType(); + QualType template_param_type = + !template_param_infos.GetParameterPack().IsEmpty() + ? GetValueParamType(template_param_infos.GetParameterPack().Front()) + : QualType(); + if (!template_param_type.isNull()) { template_param_decls.push_back(NonTypeTemplateParmDecl::Create( ast, decl_context, SourceLocation(), SourceLocation(), depth, num_template_params, identifier_info, template_param_type, @@ -1458,10 +1467,12 @@ static bool TemplateParameterAllowsValue(NamedDecl *param, } else if (auto *type_param = llvm::dyn_cast(param)) { // Compare the argument kind, i.e. ensure that != . - if (!IsValueParam(value)) + QualType value_param_type = GetValueParamType(value); + if (value_param_type.isNull()) return false; + // Compare the integral type, i.e. ensure that != . 
- if (type_param->getType() != value.getIntegralType()) + if (type_param->getType() != value_param_type) return false; } else { // There is no way to create other parameter decls at the moment, so we @@ -7351,10 +7362,27 @@ TypeSystemClang::GetIntegralTemplateArgument(lldb::opaque_compiler_type_t type, return std::nullopt; const auto *arg = GetNthTemplateArgument(template_decl, idx, expand_pack); - if (!arg || arg->getKind() != clang::TemplateArgument::Integral) + if (!arg) return std::nullopt; - return {{arg->getAsIntegral(), GetType(arg->getIntegralType())}}; + switch (arg->getKind()) { + case clang::TemplateArgument::Integral: + return {{arg->getAsIntegral(), GetType(arg->getIntegralType())}}; + case clang::TemplateArgument::StructuralValue: { + clang::APValue value = arg->getAsStructuralValue(); + CompilerType type = GetType(arg->getStructuralValueType()); + + if (value.isFloat()) + return {{value.getFloat(), type}}; + + if (value.isInt()) + return {{value.getInt(), type}}; + + return std::nullopt; + } + default: + return std::nullopt; + } } CompilerType TypeSystemClang::GetTypeForFormatters(void *type) { diff --git a/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py b/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py index db5388b8bcc6d..eac7b5ef1099a 100644 --- a/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py +++ b/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py @@ -62,10 +62,44 @@ def test(self): self.assertEqual(template_param_value.GetTypeName(), "char") self.assertEqual(chr(template_param_value.GetValueAsSigned()), "v") - # FIXME: type should be Foo - # FIXME: double/float NTTP parameter values currently not supported. - value = self.expect_expr("temp4", result_type="Foo") + value = self.expect_expr("temp4", result_type="Foo") template_param_value = value.GetType().GetTemplateArgumentValue(target, 1) self.assertEqual(template_param_value.GetTypeName(), "float") # FIXME: this should return a float self.assertEqual(template_param_value.GetValueAsSigned(), 2) + + value = self.expect_expr("temp5", result_type="Foo") + template_param_value = value.GetType().GetTemplateArgumentValue(target, 1) + self.assertEqual(template_param_value.GetTypeName(), "double") + # FIXME: this should return a float + self.assertEqual(template_param_value.GetValueAsSigned(), -250) + + # FIXME: type should be Foo + value = self.expect_expr("temp6", result_type="Foo") + self.assertFalse(value.GetType().GetTemplateArgumentValue(target, 1)) + + # FIXME: support wider range of floating point types + value = self.expect_expr("temp7", result_type="Foo<__fp16, __fp16>") + self.assertFalse(value.GetType().GetTemplateArgumentValue(target, 1)) + + value = self.expect_expr("temp8", result_type="Foo<__fp16, __fp16>") + self.assertFalse(value.GetType().GetTemplateArgumentValue(target, 1)) + + value = self.expect_expr("temp9", result_type="Bar") + template_param_value = value.GetType().GetTemplateArgumentValue(target, 1) + self.assertEqual(template_param_value.GetTypeName(), "double") + # FIXME: this should return a float + self.assertEqual(template_param_value.GetValueAsSigned(), 1) + + value = self.expect_expr( + "temp10", result_type="Bar" + ) + template_param_value = value.GetType().GetTemplateArgumentValue(target, 1) + self.assertEqual(template_param_value.GetTypeName(), "float") + # FIXME: this should return a float + self.assertEqual(template_param_value.GetValueAsSigned(), 1) + + template_param_value = 
value.GetType().GetTemplateArgumentValue(target, 2) + self.assertEqual(template_param_value.GetTypeName(), "float") + # FIXME: this should return a float + self.assertEqual(template_param_value.GetValueAsSigned(), 2) diff --git a/lldb/test/API/lang/cpp/template-arguments/main.cpp b/lldb/test/API/lang/cpp/template-arguments/main.cpp index 0c0eb97cbc858..c08679aa0e166 100644 --- a/lldb/test/API/lang/cpp/template-arguments/main.cpp +++ b/lldb/test/API/lang/cpp/template-arguments/main.cpp @@ -9,5 +9,13 @@ template struct Foo {}; Foo temp2; Foo temp3; Foo temp4; +Foo temp5; +Foo temp6; +Foo<_Float16, _Float16(1.0)> temp7; +Foo<__bf16, __bf16(1.0)> temp8; + +template struct Bar {}; +Bar temp9; +Bar temp10; int main() {} diff --git a/lldb/unittests/Symbol/TestTypeSystemClang.cpp b/lldb/unittests/Symbol/TestTypeSystemClang.cpp index 23374062127e0..a9b0c87c4fbce 100644 --- a/lldb/unittests/Symbol/TestTypeSystemClang.cpp +++ b/lldb/unittests/Symbol/TestTypeSystemClang.cpp @@ -525,7 +525,17 @@ TEST_F(TestTypeSystemClang, TemplateArguments) { infos.InsertArg("I", TemplateArgument(m_ast->getASTContext(), arg, m_ast->getASTContext().IntTy)); - // template struct foo; + llvm::APFloat float_arg(5.5f); + infos.InsertArg("F", TemplateArgument(m_ast->getASTContext(), + m_ast->getASTContext().FloatTy, + clang::APValue(float_arg))); + + llvm::APFloat double_arg(-15.2); + infos.InsertArg("D", TemplateArgument(m_ast->getASTContext(), + m_ast->getASTContext().DoubleTy, + clang::APValue(double_arg))); + + // template struct foo; ClassTemplateDecl *decl = m_ast->CreateClassTemplateDecl( m_ast->GetTranslationUnitDecl(), OptionalClangModuleID(), eAccessPublic, "foo", llvm::to_underlying(clang::TagTypeKind::Struct), infos); @@ -555,6 +565,10 @@ TEST_F(TestTypeSystemClang, TemplateArguments) { CompilerType int_type(m_ast->weak_from_this(), m_ast->getASTContext().IntTy.getAsOpaquePtr()); + CompilerType float_type(m_ast->weak_from_this(), + m_ast->getASTContext().FloatTy.getAsOpaquePtr()); + CompilerType double_type(m_ast->weak_from_this(), + m_ast->getASTContext().DoubleTy.getAsOpaquePtr()); for (CompilerType t : {type, typedef_type, auto_type}) { SCOPED_TRACE(t.GetTypeName().AsCString()); @@ -577,8 +591,32 @@ TEST_F(TestTypeSystemClang, TemplateArguments) { auto result = m_ast->GetIntegralTemplateArgument(t.GetOpaqueQualType(), 1, expand_pack); ASSERT_NE(std::nullopt, result); - EXPECT_EQ(arg, result->value); + EXPECT_EQ(arg, result->value.GetAPSInt()); EXPECT_EQ(int_type, result->type); + + EXPECT_EQ( + m_ast->GetTemplateArgumentKind(t.GetOpaqueQualType(), 2, expand_pack), + eTemplateArgumentKindStructuralValue); + EXPECT_EQ( + m_ast->GetTypeTemplateArgument(t.GetOpaqueQualType(), 2, expand_pack), + CompilerType()); + auto float_result = m_ast->GetIntegralTemplateArgument( + t.GetOpaqueQualType(), 2, expand_pack); + ASSERT_NE(std::nullopt, float_result); + EXPECT_EQ(float_arg, float_result->value.GetAPFloat()); + EXPECT_EQ(float_type, float_result->type); + + EXPECT_EQ( + m_ast->GetTemplateArgumentKind(t.GetOpaqueQualType(), 3, expand_pack), + eTemplateArgumentKindStructuralValue); + EXPECT_EQ( + m_ast->GetTypeTemplateArgument(t.GetOpaqueQualType(), 3, expand_pack), + CompilerType()); + auto double_result = m_ast->GetIntegralTemplateArgument( + t.GetOpaqueQualType(), 3, expand_pack); + ASSERT_NE(std::nullopt, double_result); + EXPECT_EQ(double_arg, double_result->value.GetAPFloat()); + EXPECT_EQ(double_type, double_result->type); } } From 44cfb6b434a5f7d29fb48e10cf25e74a669a926d Mon Sep 17 00:00:00 2001 From: 
Vyacheslav Levytskyy Date: Mon, 17 Feb 2025 23:16:47 +0100 Subject: [PATCH 057/127] [SPIR-V] Ensure that a correct pointer type is deduced from the Value argument of OpAtomic* instructions (#127492) This PR improves the set of rules for type inference by ensuring that a correct pointer type is deduced from the Value argument of OpAtomic* instructions, also when a pointer argument is coming from an `inttoptr .. to` instruction that caused problems earlier. Existing test cases are updated accordingly. This fixes https://github.com/llvm/llvm-project/issues/127491 --- llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 97 ++++++++++++------- .../atomicrmw_faddfsub_float.ll | 33 ++++++- .../SPIRV/transcoding/atomic_load_store.ll | 42 +++++++- 3 files changed, 129 insertions(+), 43 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 251bc17fef52a..5dfba8427258f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -135,7 +135,7 @@ class SPIRVEmitIntrinsics // deduce Types of operands of the Instruction if possible void deduceOperandElementType(Instruction *I, - SmallPtrSet *UncompleteRets, + SmallPtrSet *IncompleteRets, const SmallPtrSet *AskOps = nullptr, bool IsPostprocessing = false); @@ -182,12 +182,12 @@ class SPIRVEmitIntrinsics bool deduceOperandElementTypeCalledFunction( CallInst *CI, SmallVector> &Ops, - Type *&KnownElemTy); + Type *&KnownElemTy, bool &Incomplete); void deduceOperandElementTypeFunctionPointer( CallInst *CI, SmallVector> &Ops, Type *&KnownElemTy, bool IsPostprocessing); bool deduceOperandElementTypeFunctionRet( - Instruction *I, SmallPtrSet *UncompleteRets, + Instruction *I, SmallPtrSet *IncompleteRets, const SmallPtrSet *AskOps, bool IsPostprocessing, Type *&KnownElemTy, Value *Op, Function *F); @@ -893,7 +893,7 @@ static inline Type *getAtomicElemTy(SPIRVGlobalRegistry *GR, Instruction *I, // indirect function invocation, and true otherwise. bool SPIRVEmitIntrinsics::deduceOperandElementTypeCalledFunction( CallInst *CI, SmallVector> &Ops, - Type *&KnownElemTy) { + Type *&KnownElemTy, bool &Incomplete) { Function *CalledF = CI->getCalledFunction(); if (!CalledF) return false; @@ -915,12 +915,15 @@ bool SPIRVEmitIntrinsics::deduceOperandElementTypeCalledFunction( Ops.push_back(std::make_pair(Op, i)); } } else if (Grp == SPIRV::Atomic || Grp == SPIRV::AtomicFloating) { - if (CI->arg_size() < 2) + if (CI->arg_size() == 0) return true; Value *Op = CI->getArgOperand(0); if (!isPointerTy(Op->getType())) return true; switch (Opcode) { + case SPIRV::OpAtomicFAddEXT: + case SPIRV::OpAtomicFMinEXT: + case SPIRV::OpAtomicFMaxEXT: case SPIRV::OpAtomicLoad: case SPIRV::OpAtomicCompareExchangeWeak: case SPIRV::OpAtomicCompareExchange: @@ -934,9 +937,23 @@ bool SPIRVEmitIntrinsics::deduceOperandElementTypeCalledFunction( case SPIRV::OpAtomicUMax: case SPIRV::OpAtomicSMin: case SPIRV::OpAtomicSMax: { - KnownElemTy = getAtomicElemTy(GR, CI, Op); + KnownElemTy = isPointerTy(CI->getType()) ? getAtomicElemTy(GR, CI, Op) + : CI->getType(); if (!KnownElemTy) return true; + Incomplete = isTodoType(Op); + Ops.push_back(std::make_pair(Op, 0)); + } break; + case SPIRV::OpAtomicStore: { + if (CI->arg_size() < 4) + return true; + Value *ValOp = CI->getArgOperand(3); + KnownElemTy = isPointerTy(ValOp->getType()) + ? 
getAtomicElemTy(GR, CI, Op) + : ValOp->getType(); + if (!KnownElemTy) + return true; + Incomplete = isTodoType(Op); Ops.push_back(std::make_pair(Op, 0)); } break; } @@ -954,7 +971,7 @@ void SPIRVEmitIntrinsics::deduceOperandElementTypeFunctionPointer( return; Ops.push_back(std::make_pair(Op, std::numeric_limits::max())); FunctionType *FTy = CI->getFunctionType(); - bool IsNewFTy = false, IsUncomplete = false; + bool IsNewFTy = false, IsIncomplete = false; SmallVector ArgTys; for (Value *Arg : CI->args()) { Type *ArgTy = Arg->getType(); @@ -963,9 +980,9 @@ void SPIRVEmitIntrinsics::deduceOperandElementTypeFunctionPointer( IsNewFTy = true; ArgTy = getTypedPointerWrapper(ElemTy, getPointerAddressSpace(ArgTy)); if (isTodoType(Arg)) - IsUncomplete = true; + IsIncomplete = true; } else { - IsUncomplete = true; + IsIncomplete = true; } } ArgTys.push_back(ArgTy); @@ -977,19 +994,19 @@ void SPIRVEmitIntrinsics::deduceOperandElementTypeFunctionPointer( RetTy = getTypedPointerWrapper(ElemTy, getPointerAddressSpace(CI->getType())); if (isTodoType(CI)) - IsUncomplete = true; + IsIncomplete = true; } else { - IsUncomplete = true; + IsIncomplete = true; } } - if (!IsPostprocessing && IsUncomplete) + if (!IsPostprocessing && IsIncomplete) insertTodoType(Op); KnownElemTy = IsNewFTy ? FunctionType::get(RetTy, ArgTys, FTy->isVarArg()) : FTy; } bool SPIRVEmitIntrinsics::deduceOperandElementTypeFunctionRet( - Instruction *I, SmallPtrSet *UncompleteRets, + Instruction *I, SmallPtrSet *IncompleteRets, const SmallPtrSet *AskOps, bool IsPostprocessing, Type *&KnownElemTy, Value *Op, Function *F) { KnownElemTy = GR->findDeducedElementType(F); @@ -1018,13 +1035,13 @@ bool SPIRVEmitIntrinsics::deduceOperandElementTypeFunctionRet( // This may happen just once per a function, the latch is a pair of // findDeducedElementType(F) / addDeducedElementType(F, ...). // With or without the latch it is a non-recursive call due to - // UncompleteRets set to nullptr in this call. - if (UncompleteRets) - for (Instruction *UncompleteRetI : *UncompleteRets) - deduceOperandElementType(UncompleteRetI, nullptr, AskOps, + // IncompleteRets set to nullptr in this call. + if (IncompleteRets) + for (Instruction *IncompleteRetI : *IncompleteRets) + deduceOperandElementType(IncompleteRetI, nullptr, AskOps, IsPostprocessing); - } else if (UncompleteRets) { - UncompleteRets->insert(I); + } else if (IncompleteRets) { + IncompleteRets->insert(I); } TypeValidated.insert(I); return true; @@ -1035,17 +1052,17 @@ bool SPIRVEmitIntrinsics::deduceOperandElementTypeFunctionRet( // types which differ from expected, this function tries to insert a bitcast to // resolve the issue. 
void SPIRVEmitIntrinsics::deduceOperandElementType( - Instruction *I, SmallPtrSet *UncompleteRets, + Instruction *I, SmallPtrSet *IncompleteRets, const SmallPtrSet *AskOps, bool IsPostprocessing) { SmallVector> Ops; Type *KnownElemTy = nullptr; - bool Uncomplete = false; + bool Incomplete = false; // look for known basic patterns of type inference if (auto *Ref = dyn_cast(I)) { if (!isPointerTy(I->getType()) || !(KnownElemTy = GR->findDeducedElementType(I))) return; - Uncomplete = isTodoType(I); + Incomplete = isTodoType(I); for (unsigned i = 0; i < Ref->getNumIncomingValues(); i++) { Value *Op = Ref->getIncomingValue(i); if (isPointerTy(Op->getType())) @@ -1055,7 +1072,7 @@ void SPIRVEmitIntrinsics::deduceOperandElementType( KnownElemTy = GR->findDeducedElementType(I); if (!KnownElemTy) return; - Uncomplete = isTodoType(I); + Incomplete = isTodoType(I); Ops.push_back(std::make_pair(Ref->getPointerOperand(), 0)); } else if (auto *Ref = dyn_cast(I)) { if (!isPointerTy(I->getType())) @@ -1063,7 +1080,7 @@ void SPIRVEmitIntrinsics::deduceOperandElementType( KnownElemTy = GR->findDeducedElementType(I); if (!KnownElemTy) return; - Uncomplete = isTodoType(I); + Incomplete = isTodoType(I); Ops.push_back(std::make_pair(Ref->getOperand(0), 0)); } else if (auto *Ref = dyn_cast(I)) { if (GR->findDeducedElementType(Ref->getPointerOperand())) @@ -1090,22 +1107,28 @@ void SPIRVEmitIntrinsics::deduceOperandElementType( Ops.push_back(std::make_pair(Ref->getPointerOperand(), StoreInst::getPointerOperandIndex())); } else if (auto *Ref = dyn_cast(I)) { - KnownElemTy = getAtomicElemTy(GR, I, Ref->getPointerOperand()); + KnownElemTy = isPointerTy(I->getType()) + ? getAtomicElemTy(GR, I, Ref->getPointerOperand()) + : I->getType(); if (!KnownElemTy) return; + Incomplete = isTodoType(Ref->getPointerOperand()); Ops.push_back(std::make_pair(Ref->getPointerOperand(), AtomicCmpXchgInst::getPointerOperandIndex())); } else if (auto *Ref = dyn_cast(I)) { - KnownElemTy = getAtomicElemTy(GR, I, Ref->getPointerOperand()); + KnownElemTy = isPointerTy(I->getType()) + ? 
getAtomicElemTy(GR, I, Ref->getPointerOperand()) + : I->getType(); if (!KnownElemTy) return; + Incomplete = isTodoType(Ref->getPointerOperand()); Ops.push_back(std::make_pair(Ref->getPointerOperand(), AtomicRMWInst::getPointerOperandIndex())); } else if (auto *Ref = dyn_cast(I)) { if (!isPointerTy(I->getType()) || !(KnownElemTy = GR->findDeducedElementType(I))) return; - Uncomplete = isTodoType(I); + Incomplete = isTodoType(I); for (unsigned i = 0; i < Ref->getNumOperands(); i++) { Value *Op = Ref->getOperand(i); if (isPointerTy(Op->getType())) @@ -1117,11 +1140,11 @@ void SPIRVEmitIntrinsics::deduceOperandElementType( Value *Op = Ref->getReturnValue(); if (!Op) return; - if (deduceOperandElementTypeFunctionRet(I, UncompleteRets, AskOps, + if (deduceOperandElementTypeFunctionRet(I, IncompleteRets, AskOps, IsPostprocessing, KnownElemTy, Op, CurrF)) return; - Uncomplete = isTodoType(CurrF); + Incomplete = isTodoType(CurrF); Ops.push_back(std::make_pair(Op, 0)); } else if (auto *Ref = dyn_cast(I)) { if (!isPointerTy(Ref->getOperand(0)->getType())) @@ -1132,16 +1155,16 @@ void SPIRVEmitIntrinsics::deduceOperandElementType( Type *ElemTy1 = GR->findDeducedElementType(Op1); if (ElemTy0) { KnownElemTy = ElemTy0; - Uncomplete = isTodoType(Op0); + Incomplete = isTodoType(Op0); Ops.push_back(std::make_pair(Op1, 1)); } else if (ElemTy1) { KnownElemTy = ElemTy1; - Uncomplete = isTodoType(Op1); + Incomplete = isTodoType(Op1); Ops.push_back(std::make_pair(Op0, 0)); } } else if (CallInst *CI = dyn_cast(I)) { if (!CI->isIndirectCall()) - deduceOperandElementTypeCalledFunction(CI, Ops, KnownElemTy); + deduceOperandElementTypeCalledFunction(CI, Ops, KnownElemTy, Incomplete); else if (HaveFunPtrs) deduceOperandElementTypeFunctionPointer(CI, Ops, KnownElemTy, IsPostprocessing); @@ -1175,7 +1198,7 @@ void SPIRVEmitIntrinsics::deduceOperandElementType( Type *PrevElemTy = GR->findDeducedElementType(Op); GR->addDeducedElementType(Op, normalizeType(KnownElemTy)); // check if KnownElemTy is complete - if (!Uncomplete) + if (!Incomplete) eraseTodoType(Op); else if (!IsPostprocessing) insertTodoType(Op); @@ -2394,9 +2417,9 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { // Pass backward: use instructions results to specify/update/cast operands // where needed. - SmallPtrSet UncompleteRets; + SmallPtrSet IncompleteRets; for (auto &I : llvm::reverse(instructions(Func))) - deduceOperandElementType(&I, &UncompleteRets); + deduceOperandElementType(&I, &IncompleteRets); // Pass forward for PHIs only, their operands are not preceed the instruction // in meaning of `instructions(Func)`. 
@@ -2465,7 +2488,7 @@ bool SPIRVEmitIntrinsics::postprocessTypes(Module &M) { for (auto &F : M) { CurrF = &F; - SmallPtrSet UncompleteRets; + SmallPtrSet IncompleteRets; for (auto &I : llvm::reverse(instructions(F))) { auto It = ToProcess.find(&I); if (It == ToProcess.end()) @@ -2473,7 +2496,7 @@ bool SPIRVEmitIntrinsics::postprocessTypes(Module &M) { It->second.remove_if([this](Value *V) { return !isTodoType(V); }); if (It->second.size() == 0) continue; - deduceOperandElementType(&I, &UncompleteRets, &It->second, true); + deduceOperandElementType(&I, &IncompleteRets, &It->second, true); if (TodoTypeSz == 0) return true; } diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_shader_atomic_float_add/atomicrmw_faddfsub_float.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_shader_atomic_float_add/atomicrmw_faddfsub_float.ll index 075e63ea6de61..c6c8afc47dee3 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_shader_atomic_float_add/atomicrmw_faddfsub_float.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_shader_atomic_float_add/atomicrmw_faddfsub_float.ll @@ -1,6 +1,10 @@ ; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_EXT_shader_atomic_float_add %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_EXT_shader_atomic_float_add %s -o - -filetype=obj | spirv-val %} + +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_shader_atomic_float_add %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_shader_atomic_float_add %s -o - -filetype=obj | spirv-val %} ; CHECK-ERROR: LLVM ERROR: The atomic float instruction requires the following SPIR-V extension: SPV_EXT_shader_atomic_float_add @@ -25,9 +29,6 @@ ; CHECK: %[[Neg42:[0-9]+]] = OpFNegate %[[TyFP32]] %[[Const42]] ; CHECK: OpAtomicFAddEXT %[[TyFP32]] %[[DblPtr]] %[[ScopeWorkgroup]] %[[WorkgroupMemory]] %[[Neg42]] -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64" - @f = common dso_local local_unnamed_addr addrspace(1) global float 0.000000e+00, align 8 define dso_local spir_func void @test1() local_unnamed_addr { @@ -55,5 +56,31 @@ entry: declare spir_func float @_Z25atomic_fetch_add_explicitPU3AS1VU7_Atomicff12memory_order(ptr addrspace(1), float, i32) declare spir_func float @_Z25atomic_fetch_sub_explicitPU3AS1VU7_Atomicff12memory_order(ptr addrspace(1), float, i32) +; CHECK: %[[#Ptr1:]] = OpConvertUToPtr %[[TyFP32Ptr]] %[[#]] +; CHECK: %[[#]] = OpAtomicFAddEXT %[[TyFP32]] %[[#Ptr1]] %[[#]] %[[#]] %[[#]] +; CHECK: %[[#Ptr2:]] = OpConvertUToPtr %[[TyFP32Ptr]] %[[#]] +; CHECK: %[[#]] = OpAtomicFAddEXT %[[TyFP32]] %[[#Ptr2]] %[[#]] %[[#]] %[[#]] +; CHECK: %[[#Ptr3:]] = OpConvertUToPtr %[[TyFP32Ptr]] %[[#]] +; CHECK: %[[#]] = OpAtomicFAddEXT %[[TyFP32]] %[[#Ptr3]] %[[#]] %[[#]] %[[#]] +; CHECK: %[[#Ptr4:]] = OpConvertUToPtr %[[TyFP32Ptr]] %[[#]] +; CHECK: %[[#]] = OpAtomicFAddEXT %[[TyFP32]] %[[#Ptr4]] %[[#]] %[[#]] %[[#]] +; CHECK: %[[#Ptr5:]] = OpConvertUToPtr %[[TyFP32Ptr]] %[[#]] +; CHECK: %[[#]] = OpAtomicFAddEXT %[[TyFP32]] %[[#Ptr5]] %[[#]] %[[#]] %[[#]] + +define dso_local spir_func void @test4(i64 noundef %arg, float %val) local_unnamed_addr { +entry: + %ptr1 = inttoptr i64 %arg to float addrspace(1)* + %v1 = atomicrmw fadd ptr addrspace(1) %ptr1, 
float %val seq_cst, align 4 + %ptr2 = inttoptr i64 %arg to float addrspace(1)* + %v2 = atomicrmw fsub ptr addrspace(1) %ptr2, float %val seq_cst, align 4 + %ptr3 = inttoptr i64 %arg to float addrspace(1)* + %v3 = tail call spir_func float @_Z21__spirv_AtomicFAddEXT(ptr addrspace(1) %ptr3, i32 1, i32 16, float %val) + %ptr4 = inttoptr i64 %arg to float addrspace(1)* + %v4 = tail call spir_func float @_Z25atomic_fetch_add_explicitPU3AS1VU7_Atomicff12memory_order(ptr addrspace(1) %ptr4, float %val, i32 0) + %ptr5 = inttoptr i64 %arg to float addrspace(1)* + %v5 = tail call spir_func float @_Z25atomic_fetch_sub_explicitPU3AS1VU7_Atomicff12memory_order(ptr addrspace(1) %ptr5, float %val, i32 0) + ret void +} + !llvm.module.flags = !{!0} !0 = !{i32 1, !"wchar_size", i32 4} diff --git a/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll b/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll index 3e5a3ac356936..17a915e33c973 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll @@ -1,6 +1,9 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + ;; Check 'LLVM ==> SPIR-V' conversion of atomic_load and atomic_store. ; CHECK-SPIRV-LABEL: OpFunction @@ -17,17 +20,50 @@ entry: ; CHECK-SPIRV-LABEL: OpFunction ; CHECK-SPIRV-NEXT: %[[#object:]] = OpFunctionParameter %[[#]] -; CHECK-SPIRV-NEXT: OpFunctionParameter ; CHECK-SPIRV-NEXT: %[[#desired:]] = OpFunctionParameter %[[#]] ; CHECK-SPIRV: OpAtomicStore %[[#object]] %[[#]] %[[#]] %[[#desired]] ; CHECK-SPIRV-LABEL: OpFunctionEnd -define spir_func void @test_store(i32 addrspace(4)* %object, i32 addrspace(4)* %expected, i32 %desired) { +define spir_func void @test_store(i32 addrspace(4)* %object, i32 %desired) { entry: call spir_func void @_Z12atomic_storePVU3AS4U7_Atomicii(i32 addrspace(4)* %object, i32 %desired) ret void } declare spir_func i32 @_Z11atomic_loadPVU3AS4U7_Atomici(i32 addrspace(4)*) - declare spir_func void @_Z12atomic_storePVU3AS4U7_Atomicii(i32 addrspace(4)*, i32) + +; The goal of @test_typesX() cases is to ensure that a correct pointer type +; is deduced from the Value argument of OpAtomicLoad/OpAtomicStore. There is +; no need to add more pattern matching rules to be sure that the pointer type +; is valid, it's enough that `spirv-val` considers the output valid as it +; checks the same condition while validating the output. 
+
+define spir_func void @test_types1(ptr addrspace(1) %ptr, float %val) {
+entry:
+  %r = call spir_func float @atomic_load(ptr addrspace(1) %ptr)
+  ret void
+}
+
+define spir_func void @test_types2(ptr addrspace(1) %ptr, float %val) {
+entry:
+  call spir_func void @atomic_store(ptr addrspace(1) %ptr, float %val)
+  ret void
+}
+
+define spir_func void @test_types3(i64 noundef %arg, float %val) {
+entry:
+  %ptr1 = inttoptr i64 %arg to float addrspace(1)*
+  %r = call spir_func float @atomic_load(ptr addrspace(1) %ptr1)
+  ret void
+}
+
+define spir_func void @test_types4(i64 noundef %arg, float %val) {
+entry:
+  %ptr2 = inttoptr i64 %arg to float addrspace(1)*
+  call spir_func void @atomic_store(ptr addrspace(1) %ptr2, float %val)
+  ret void
+}
+
+declare spir_func float @atomic_load(ptr addrspace(1))
+declare spir_func void @atomic_store(ptr addrspace(1), float)

From 798890ea10c4a111dff79d975534744f19c5a00d Mon Sep 17 00:00:00 2001
From: David Green
Date: Mon, 17 Feb 2025 22:17:33 +0000
Subject: [PATCH 058/127] [AArch64] Remove unused HasFPAC. NFC

It contains a syntax error, but as it is unused it can be removed until we
need it.

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c9549f12769d1..93a6100ce54e9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -335,8 +335,6 @@ def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">,
 AssemblerPredicateWithAll<(all_of FeatureMatMulFP32), "f32mm">;
 def HasMatMulFP64 : Predicate<"Subtarget->hasMatMulFP64()">,
 AssemblerPredicateWithAll<(all_of FeatureMatMulFP64), "f64mm">;
-def HasFPAC : Predicate<"Subtarget->hasFPAC())">,
- AssemblerPredicateWithAll<(all_of FeatureFPAC), "fpac">;
 def HasXS : Predicate<"Subtarget->hasXS()">,
 AssemblerPredicateWithAll<(all_of FeatureXS), "xs">;
 def HasWFxT : Predicate<"Subtarget->hasWFxT()">,

From 0b8bd472b0faf79005dfdd1078904fdf39879d61 Mon Sep 17 00:00:00 2001
From: Brad Smith
Date: Mon, 17 Feb 2025 17:46:02 -0500
Subject: [PATCH 059/127] [OpenMP][libomp] Add OpenBSD, NetBSD and DragonFly stdarg handling (#126182)

Fixes build on OpenBSD/aarch64.
``` FAILED: openmp/runtime/src/CMakeFiles/omp.dir/kmp_runtime.cpp.o /home/brad/tmp/llvm-build/bin/clang++ --target=aarch64-unknown-openbsd7.6 -D_DEBUG -D_GLIBCXX_ASSERTIONS -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -Domp_EXPORTS -I/home/brad/tmp/llvm-build/runtimes/runtimes-bins/openmp/runtime/src -I/home/brad/tmp/llvm-brad/openmp/runtime/src -I/home/brad/tmp/llvm-brad/openmp/runtime/src/i18n -I/home/brad/tmp/llvm-brad/openmp/runtime/src/include -I/home/brad/tmp/llvm-brad/openmp/runtime/src/thirdparty/ittnotify -fPIC -fno-semantic-interposition -fvisibility-inlines-hidden -Werror=date-time -Werror=unguarded-availability-new -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -Wimplicit-fallthrough -Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wsuggest-override -Wstring-conversion -Wmisleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -ffunction-sections -fdata-sections -Wall -fcolor-diagnostics -Wcast-qual -Wformat-pedantic -Wimplicit-fallthrough -Wsign-compare -Wno-extra -Wno-pedantic -fno-semantic-interposition -fdata-sections -O3 -DNDEBUG -std=c++17 -fPIC -D _GNU_SOURCE -D _REENTRANT -U_GLIBCXX_ASSERTIONS -UNDEBUG -fno-exceptions -fno-rtti -Wno-covered-switch-default -Wno-frame-address -Wno-strict-aliasing -Wno-switch -Wno-uninitialized -Wno-return-type-c-linkage -Wno-cast-qual -Wno-int-to-void-pointer-cast -MD -MT openmp/runtime/src/CMakeFiles/omp.dir/kmp_runtime.cpp.o -MF openmp/runtime/src/CMakeFiles/omp.dir/kmp_runtime.cpp.o.d -o openmp/runtime/src/CMakeFiles/omp.dir/kmp_runtime.cpp.o -c /home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp /home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:1449:47: error: value of type 'kmp_va_list' (aka '__builtin_va_list') is not contextually convertible to 'bool' 1449 | return (master_th->th.th_teams_microtask && ap && | ^~ /home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:1449:44: error: invalid operands to binary expression ('microtask_t' (aka 'void (*)(int *, int *, ...)') and 'kmp_va_list' (aka '__builtin_va_list')) 1449 | return (master_th->th.th_teams_microtask && ap && | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ^ ~~ /home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:1457:15: warning: comparison between NULL and non-pointer ('kmp_va_list' (aka '__builtin_va_list') and NULL) [-Wnull-arithmetic] 1457 | return ((ap == NULL && active_level == 0) || | ~~ ^ ~~~~ /home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:1457:15: error: invalid operands to binary expression ('kmp_va_list' (aka '__builtin_va_list') and 'long') 1457 | return ((ap == NULL && active_level == 0) || | ~~ ^ ~~~~ /home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:1458:12: error: value of type 'kmp_va_list' (aka '__builtin_va_list') is not contextually convertible to 'bool' 1458 | (ap && teams_level > 0 && teams_level == level)); | ^~ /home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:1458:15: error: invalid operands to binary expression ('kmp_va_list' (aka '__builtin_va_list') and 'bool') 1458 | (ap && teams_level > 0 && teams_level == level)); | ~~ ^ ~~~~~~~~~~~~~~~ /home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:1735:9: error: invalid argument type 'kmp_va_list' (aka '__builtin_va_list') to unary expression 1735 | if (!ap) { | ^~~ /home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:2169:66: warning: comparison between NULL and non-pointer ('kmp_va_list' (aka '__builtin_va_list') 
and NULL) [-Wnull-arithmetic] 2169 | !(microtask == (microtask_t)__kmp_teams_master || ap == NULL)) | ~~ ^ ~~~~ /home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:2169:66: error: invalid operands to binary expression ('kmp_va_list' (aka '__builtin_va_list') and 'long') 2169 | !(microtask == (microtask_t)__kmp_teams_master || ap == NULL)) | ~~ ^ ~~~~ /home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:2284:9: error: value of type 'kmp_va_list' (aka '__builtin_va_list') is not contextually convertible to 'bool' 2284 | if (ap) { | ^~ /home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:2302:58: error: invalid argument type 'kmp_va_list' (aka '__builtin_va_list') to unary expression 2302 | __kmp_fork_team_threads(root, team, master_th, gtid, !ap); | ^~~ /home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:2363:9: error: value of type 'kmp_va_list' (aka '__builtin_va_list') is not contextually convertible to 'bool' 2363 | if (ap) { | ^~ /home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:7803:3: error: no matching function for call to '__kmp_fork_call' 7803 | __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, | ^~~~~~~~~~~~~~~ /home/brad/tmp/llvm-brad/openmp/runtime/src/kmp_runtime.cpp:1927:5: note: candidate function not viable: no known conversion from 'long' to 'kmp_va_list' (aka '__builtin_va_list') for 7th argument 1927 | int __kmp_fork_call(ident_t *loc, int gtid, | ^ 1928 | enum fork_context_e call_context, // Intel, GNU, ... 1929 | kmp_int32 argc, microtask_t microtask, launch_t invoker, 1930 | kmp_va_list ap) { | ~~~~~~~~~~~~~~ 2 warnings and 11 errors generated. ``` --- openmp/runtime/src/kmp_os.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h index 2252f5e7e97a7..29a281f096855 100644 --- a/openmp/runtime/src/kmp_os.h +++ b/openmp/runtime/src/kmp_os.h @@ -219,7 +219,8 @@ typedef kmp_uint32 kmp_uint; // stdarg handling #if (KMP_ARCH_ARM || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64 || KMP_ARCH_WASM) && \ - (KMP_OS_FREEBSD || KMP_OS_LINUX || KMP_OS_WASI) + (KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_OPENBSD || KMP_OS_DRAGONFLY || \ + KMP_OS_LINUX || KMP_OS_WASI) typedef va_list *kmp_va_list; #define kmp_va_deref(ap) (*(ap)) #define kmp_va_addr_of(ap) (&(ap)) From a3dc77c00a012bb613cb08e669dab4fadf88e935 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 17 Feb 2025 15:44:41 -0800 Subject: [PATCH 060/127] [lldb] Support stepping through C++ thunks (#127419) This PR fixes LLDB stepping out, rather than stepping through a C++ thunk. The implementation is based on, and upstreams, the support for runtime thunks in the Swift fork. 
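As an illustration of the kind of symbol this targets (the example below is a
sketch, not part of the patch, and the names are made up), a virtual function
overridden through a non-primary base is reached via a compiler-generated
`this`-adjusting thunk; under the Itanium C++ ABI such thunks are mangled with
the prefixes the new `IsSymbolARuntimeThunk` check looks for (`_ZTh`, `_ZTv`,
`_ZTc`):

```
// Sketch only (hypothetical names). Calling b2->doit() dispatches through a
// thunk that adjusts `this` from the Base2 subobject to the Derived object and
// then jumps to Derived::doit(). The thunk has no useful line info, so "step
// in" previously stepped out of the frame instead of stopping in
// Derived::doit().
struct Base1 {
  virtual ~Base1() = default;
};
struct Base2 {
  virtual void doit() = 0;
};
struct Derived : Base1, Base2 {
  void doit() override {} // override reached through the non-primary Base2
};

void caller(Base2 *b2) { b2->doit(); }

int main() {
  Derived d;
  caller(&d); // stepping into doit() here goes through the thunk
}
```

The new test in lldb/test/API/lang/cpp/thunk exercises this same shape, for
both the debug and nodebug cases.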
Fixes #43413 --- lldb/include/lldb/Target/LanguageRuntime.h | 2 + .../CPlusPlus/CPPLanguageRuntime.cpp | 11 ++++ .../CPlusPlus/CPPLanguageRuntime.h | 3 ++ .../Target/ThreadPlanShouldStopHere.cpp | 53 +++++++++++++++---- lldb/test/API/lang/cpp/thunk/Makefile | 3 ++ lldb/test/API/lang/cpp/thunk/TestThunk.py | 46 ++++++++++++++++ lldb/test/API/lang/cpp/thunk/main.cpp | 48 +++++++++++++++++ 7 files changed, 155 insertions(+), 11 deletions(-) create mode 100644 lldb/test/API/lang/cpp/thunk/Makefile create mode 100644 lldb/test/API/lang/cpp/thunk/TestThunk.py create mode 100644 lldb/test/API/lang/cpp/thunk/main.cpp diff --git a/lldb/include/lldb/Target/LanguageRuntime.h b/lldb/include/lldb/Target/LanguageRuntime.h index f9ae2dc589632..7e4c11df0da7f 100644 --- a/lldb/include/lldb/Target/LanguageRuntime.h +++ b/lldb/include/lldb/Target/LanguageRuntime.h @@ -201,6 +201,8 @@ class LanguageRuntime : public Runtime, public PluginInterface { return false; } + virtual bool IsSymbolARuntimeThunk(const Symbol &symbol) { return false; } + // Given the name of a runtime symbol (e.g. in Objective-C, an ivar offset // symbol), try to determine from the runtime what the value of that symbol // would be. Useful when the underlying binary is stripped. diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp index 42fa54634841c..21a5ebe53073a 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp @@ -476,3 +476,14 @@ CPPLanguageRuntime::GetStepThroughTrampolinePlan(Thread &thread, return ret_plan_sp; } + +bool CPPLanguageRuntime::IsSymbolARuntimeThunk(const Symbol &symbol) { + llvm::StringRef mangled_name = + symbol.GetMangled().GetMangledName().GetStringRef(); + // Virtual function overriding from a non-virtual base use a "Th" prefix. + // Virtual function overriding from a virtual base must use a "Tv" prefix. + // Virtual function overriding thunks with covariant returns use a "Tc" + // prefix. 
+ return mangled_name.starts_with("_ZTh") || mangled_name.starts_with("_ZTv") || + mangled_name.starts_with("_ZTc"); +} diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.h b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.h index 57cfe28245808..05639e9798917 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.h +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.h @@ -78,6 +78,9 @@ class CPPLanguageRuntime : public LanguageRuntime { bool stop_others) override; bool IsAllowedRuntimeValue(ConstString name) override; + + bool IsSymbolARuntimeThunk(const Symbol &symbol) override; + protected: // Classes that inherit from CPPLanguageRuntime can see and modify these CPPLanguageRuntime(Process *process); diff --git a/lldb/source/Target/ThreadPlanShouldStopHere.cpp b/lldb/source/Target/ThreadPlanShouldStopHere.cpp index e72f8d8f51a20..fa6bc08a9914d 100644 --- a/lldb/source/Target/ThreadPlanShouldStopHere.cpp +++ b/lldb/source/Target/ThreadPlanShouldStopHere.cpp @@ -8,6 +8,7 @@ #include "lldb/Target/ThreadPlanShouldStopHere.h" #include "lldb/Symbol/Symbol.h" +#include "lldb/Target/LanguageRuntime.h" #include "lldb/Target/RegisterContext.h" #include "lldb/Target/Thread.h" #include "lldb/Utility/LLDBLog.h" @@ -76,6 +77,19 @@ bool ThreadPlanShouldStopHere::DefaultShouldStopHereCallback( } } + // Check whether the frame we are in is a language runtime thunk, only for + // step out: + if (operation == eFrameCompareOlder) { + if (Symbol *symbol = frame->GetSymbolContext(eSymbolContextSymbol).symbol) { + ProcessSP process_sp(current_plan->GetThread().GetProcess()); + for (auto *runtime : process_sp->GetLanguageRuntimes()) { + if (runtime->IsSymbolARuntimeThunk(*symbol)) { + should_stop_here = false; + break; + } + } + } + } // Always avoid code with line number 0. // FIXME: At present the ShouldStop and the StepFromHere calculate this // independently. If this ever @@ -109,18 +123,35 @@ ThreadPlanSP ThreadPlanShouldStopHere::DefaultStepFromHereCallback( if (sc.line_entry.line == 0) { AddressRange range = sc.line_entry.range; - - // If the whole function is marked line 0 just step out, that's easier & - // faster than continuing to step through it. bool just_step_out = false; - if (sc.symbol && sc.symbol->ValueIsAddress()) { - Address symbol_end = sc.symbol->GetAddress(); - symbol_end.Slide(sc.symbol->GetByteSize() - 1); - if (range.ContainsFileAddress(sc.symbol->GetAddress()) && - range.ContainsFileAddress(symbol_end)) { - LLDB_LOGF(log, "Stopped in a function with only line 0 lines, just " - "stepping out."); - just_step_out = true; + if (sc.symbol) { + ProcessSP process_sp(current_plan->GetThread().GetProcess()); + + // If this is a runtime thunk, step through it, rather than stepping out + // because it's marked line 0. + bool is_thunk = false; + for (auto *runtime : process_sp->GetLanguageRuntimes()) { + if (runtime->IsSymbolARuntimeThunk(*sc.symbol)) { + LLDB_LOGF(log, "In runtime thunk %s - stepping out.", + sc.symbol->GetName().GetCString()); + is_thunk = true; + break; + } + } + + // If the whole function is marked line 0 just step out, that's easier & + // faster than continuing to step through it. + // FIXME: This assumes that the function is a single line range. It could + // be a series of contiguous line 0 ranges. Check for that too. 
+ if (!is_thunk && sc.symbol->ValueIsAddress()) { + Address symbol_end = sc.symbol->GetAddress(); + symbol_end.Slide(sc.symbol->GetByteSize() - 1); + if (range.ContainsFileAddress(sc.symbol->GetAddress()) && + range.ContainsFileAddress(symbol_end)) { + LLDB_LOGF(log, "Stopped in a function with only line 0 lines, just " + "stepping out."); + just_step_out = true; + } } } if (!just_step_out) { diff --git a/lldb/test/API/lang/cpp/thunk/Makefile b/lldb/test/API/lang/cpp/thunk/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/lang/cpp/thunk/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/lang/cpp/thunk/TestThunk.py b/lldb/test/API/lang/cpp/thunk/TestThunk.py new file mode 100644 index 0000000000000..ceb16263648d6 --- /dev/null +++ b/lldb/test/API/lang/cpp/thunk/TestThunk.py @@ -0,0 +1,46 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class ThunkTest(TestBase): + def test_step_through_thunk(self): + self.build() + lldbutil.run_to_name_breakpoint(self, "testit") + + # Make sure we step through the thunk into Derived1::doit + self.expect( + "step", + STEP_IN_SUCCEEDED, + substrs=["stop reason = step in", "Derived1::doit"], + ) + + self.runCmd("continue") + + self.expect( + "step", + STEP_IN_SUCCEEDED, + substrs=["stop reason = step in", "Derived2::doit"], + ) + + def test_step_out_thunk(self): + self.build() + lldbutil.run_to_name_breakpoint(self, "testit_debug") + + # Make sure we step out of the thunk and end up in testit_debug. + source = "main.cpp" + line = line_number(source, "// Step here") + self.expect( + "step", + STEP_IN_SUCCEEDED, + substrs=["stop reason = step in", "{}:{}".format(source, line)], + ) + + self.runCmd("continue") + + self.expect( + "step", + STEP_IN_SUCCEEDED, + substrs=["stop reason = step in", "Derived2::doit_debug"], + ) diff --git a/lldb/test/API/lang/cpp/thunk/main.cpp b/lldb/test/API/lang/cpp/thunk/main.cpp new file mode 100644 index 0000000000000..82d17b1350093 --- /dev/null +++ b/lldb/test/API/lang/cpp/thunk/main.cpp @@ -0,0 +1,48 @@ +#include + +class Base1 { +public: + virtual ~Base1() {} +}; + +class Base2 { +public: + virtual void doit() = 0; + virtual void doit_debug() = 0; +}; + +Base2 *b; + +class Derived1 : public Base1, public Base2 { +public: + virtual void doit() { printf("Derived1\n"); } + virtual void __attribute__((nodebug)) doit_debug() { + printf("Derived1 (no debug)\n"); + } +}; + +class Derived2 : public Base2 { +public: + virtual void doit() { printf("Derived2\n"); } + virtual void doit_debug() { printf("Derived2 (debug)\n"); } +}; + +void testit() { b->doit(); } + +void testit_debug() { + b->doit_debug(); + printf("This is where I should step out to with nodebug.\n"); // Step here +} + +int main() { + + b = new Derived1(); + testit(); + testit_debug(); + + b = new Derived2(); + testit(); + testit_debug(); + + return 0; +} From 2b41277a09820bc47dc533ad37b5213edc2e8d52 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 17 Feb 2025 16:25:09 -0800 Subject: [PATCH 061/127] [lldb] Disable test_step_out_thunk on Windows On Windows we end up in assembly. Not sure if the thread plans behave differently or this is a debug info issue. I have no environment to reproduce and investigate this in, so I'm disabling the test for now. 
--- lldb/test/API/lang/cpp/thunk/TestThunk.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/test/API/lang/cpp/thunk/TestThunk.py b/lldb/test/API/lang/cpp/thunk/TestThunk.py index ceb16263648d6..9370c1c58c18b 100644 --- a/lldb/test/API/lang/cpp/thunk/TestThunk.py +++ b/lldb/test/API/lang/cpp/thunk/TestThunk.py @@ -24,6 +24,7 @@ def test_step_through_thunk(self): substrs=["stop reason = step in", "Derived2::doit"], ) + @skipIfWindows def test_step_out_thunk(self): self.build() lldbutil.run_to_name_breakpoint(self, "testit_debug") From 8fe290efa634c449937d0576e391555d0ebb6efb Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 17 Feb 2025 17:06:59 -0800 Subject: [PATCH 062/127] [libc] Canonicalize generated fenv.h (#127363) This removes the custom template for fenv.h by declaring all the standard-specified macros using macro_header. --- libc/include/fenv.h.def | 17 ----------------- libc/include/fenv.yaml | 35 ++++++++++++++++++++++++++++------- 2 files changed, 28 insertions(+), 24 deletions(-) delete mode 100644 libc/include/fenv.h.def diff --git a/libc/include/fenv.h.def b/libc/include/fenv.h.def deleted file mode 100644 index c677b2a5930dc..0000000000000 --- a/libc/include/fenv.h.def +++ /dev/null @@ -1,17 +0,0 @@ -//===-- C standard library header fenv.h ----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_FENV_H -#define LLVM_LIBC_FENV_H - -#include "__llvm-libc-common.h" -#include "llvm-libc-macros/fenv-macros.h" - -%%public_api() - -#endif // LLVM_LIBC_FENV_H diff --git a/libc/include/fenv.yaml b/libc/include/fenv.yaml index 1ecaf63085504..c7cc7e87df37f 100644 --- a/libc/include/fenv.yaml +++ b/libc/include/fenv.yaml @@ -1,11 +1,32 @@ header: fenv.h -header_template: fenv.h.def -macros: [] +standards: + - stdc +macros: + - macro_name: FE_ALL_EXCEPT + macro_header: fenv-macros.h + - macro_name: FE_DIVBYZERO + macro_header: fenv-macros.h + - macro_name: FE_INEXACT + macro_header: fenv-macros.h + - macro_name: FE_INVALID + macro_header: fenv-macros.h + - macro_name: FE_OVERFLOW + macro_header: fenv-macros.h + - macro_name: FE_UNDERFLOW + macro_header: fenv-macros.h + - macro_name: FE_DOWNWARD + macro_header: fenv-macros.h + - macro_name: FE_TONEAREST + macro_header: fenv-macros.h + - macro_name: FE_TOWARDZERO + macro_header: fenv-macros.h + - macro_name: FE_UPWARD + macro_header: fenv-macros.h + - macro_name: FE_DFL_ENV + macro_header: fenv-macros.h types: - type_name: fenv_t - type_name: fexcept_t -enums: [] -objects: [] functions: - name: feclearexcept standards: @@ -15,14 +36,14 @@ functions: - type: int - name: fedisableexcept standards: - - GNUExtensions + - gnu return_type: int arguments: - type: int guard: null - name: feenableexcept standards: - - GNUExtensions + - gnu return_type: int arguments: - type: int @@ -35,7 +56,7 @@ functions: - type: fenv_t * - name: fegetexcept standards: - - GNUExtensions + - gnu return_type: int arguments: [] - name: fegetexceptflag From ed38d6702f7695092c9486016e2504f8c6bfef37 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 18 Feb 2025 08:07:29 +0700 Subject: [PATCH 063/127] PeepholeOpt: Handle subregister compose when looking through reg_sequence (#127051) Previously this would give up on folding subregister copies through a 
reg_sequence if the input operand already had a subregister index. d246cc618adc52fdbd69d44a2a375c8af97b6106 stopped introducing these subregister uses, and this is the first step to lifting that restriction. I was expecting to be able to implement this only purely with compose / reverse compose, but I wasn't able to make it work so relies on testing the lanemasks for whether the copy reads a subset of the input. --- llvm/lib/CodeGen/PeepholeOptimizer.cpp | 33 +++++++++- .../AMDGPU/GlobalISel/extractelement.ll | 10 ++-- ...e92561-restore-undef-scc-verifier-error.ll | 60 ++++++++++--------- .../peephole-opt-fold-reg-sequence-subreg.mir | 52 ++++++++-------- 4 files changed, 94 insertions(+), 61 deletions(-) diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 745c0d4b36a62..24bd9938bc45c 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -1984,12 +1984,43 @@ ValueTrackerResult ValueTracker::getNextSourceFromRegSequence() { // We are looking at: // Def = REG_SEQUENCE v0, sub0, v1, sub1, ... - // Check if one of the operand defines the subreg we are interested in. + // + // Check if one of the operands exactly defines the subreg we are interested + // in. for (const RegSubRegPairAndIdx &RegSeqInput : RegSeqInputRegs) { if (RegSeqInput.SubIdx == DefSubReg) return ValueTrackerResult(RegSeqInput.Reg, RegSeqInput.SubReg); } + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + + // If we did not find an exact match, see if we can do a composition to + // extract a sub-subregister. + for (const RegSubRegPairAndIdx &RegSeqInput : RegSeqInputRegs) { + // We don't check if the resulting class supports the subregister index + // yet. This will occur before any rewrite when looking for an eligible + // source. + + LaneBitmask DefMask = TRI->getSubRegIndexLaneMask(DefSubReg); + LaneBitmask ThisOpRegMask = TRI->getSubRegIndexLaneMask(RegSeqInput.SubIdx); + + // Check that this extract reads a subset of this single reg_sequence input. + // + // FIXME: We should be able to filter this in terms of the indexes directly + // without checking the lanemasks. + if ((DefMask & ThisOpRegMask) != DefMask) + continue; + + unsigned ReverseDefCompose = + TRI->reverseComposeSubRegIndices(RegSeqInput.SubIdx, DefSubReg); + if (!ReverseDefCompose) + continue; + + unsigned ComposedDefInSrcReg1 = + TRI->composeSubRegIndices(RegSeqInput.SubReg, ReverseDefCompose); + return ValueTrackerResult(RegSeqInput.Reg, ComposedDefInSrcReg1); + } + // If the subreg we are tracking is super-defined by another subreg, // we could follow this value. However, this would require to compose // the subreg and we do not do that for now. 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index f2a4332bcb8ba..c136028f2de43 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -2872,8 +2872,8 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) { ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: dyn_extract_v7f64_v_v: @@ -2898,8 +2898,8 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) { ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v14 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: dyn_extract_v7f64_v_v: @@ -2918,7 +2918,7 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v14 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v12 :: v_dual_cndmask_b32 v1, v1, v13 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v14 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v0 :: v_dual_cndmask_b32 v1, v1, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <7 x double> %vec, i32 %sel diff --git a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll index 3eb9d474ec030..f961e857f39e5 100644 --- a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll +++ b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll @@ -79,9 +79,9 @@ define void @issue92561(ptr addrspace(1) %arg) { ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_clause 0x1 -; GISEL-NEXT: global_load_b128 v[2:5], v[0:1], off -; GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: global_load_b128 v[4:7], v[0:1], off +; GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16 +; GISEL-NEXT: v_mov_b32_e32 v8, 0 ; GISEL-NEXT: s_mov_b32 s20, 0 ; GISEL-NEXT: s_mov_b32 s3, exec_lo ; GISEL-NEXT: s_mov_b32 s21, s20 @@ -97,19 +97,19 @@ define void @issue92561(ptr addrspace(1) %arg) { ; GISEL-NEXT: s_mov_b32 s11, s20 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_readfirstlane_b32 s12, v2 -; GISEL-NEXT: v_readfirstlane_b32 s13, v3 -; GISEL-NEXT: v_readfirstlane_b32 s14, v4 -; GISEL-NEXT: v_readfirstlane_b32 s15, v5 -; GISEL-NEXT: v_readfirstlane_b32 s16, v6 -; GISEL-NEXT: v_readfirstlane_b32 s17, v7 -; GISEL-NEXT: v_readfirstlane_b32 s18, v8 -; GISEL-NEXT: v_readfirstlane_b32 s19, v9 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[12:13], v[2:3] -; GISEL-NEXT: v_cmp_eq_u64_e64 s0, s[14:15], v[4:5] -; GISEL-NEXT: v_cmp_eq_u64_e64 s1, s[16:17], v[6:7] +; GISEL-NEXT: v_readfirstlane_b32 s12, v4 +; GISEL-NEXT: v_readfirstlane_b32 s13, v5 +; GISEL-NEXT: 
v_readfirstlane_b32 s14, v6 +; GISEL-NEXT: v_readfirstlane_b32 s15, v7 +; GISEL-NEXT: v_readfirstlane_b32 s16, v0 +; GISEL-NEXT: v_readfirstlane_b32 s17, v1 +; GISEL-NEXT: v_readfirstlane_b32 s18, v2 +; GISEL-NEXT: v_readfirstlane_b32 s19, v3 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[12:13], v[4:5] +; GISEL-NEXT: v_cmp_eq_u64_e64 s0, s[14:15], v[6:7] +; GISEL-NEXT: v_cmp_eq_u64_e64 s1, s[16:17], v[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GISEL-NEXT: v_cmp_eq_u64_e64 s2, s[18:19], v[8:9] +; GISEL-NEXT: v_cmp_eq_u64_e64 s2, s[18:19], v[2:3] ; GISEL-NEXT: s_and_b32 s0, vcc_lo, s0 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GISEL-NEXT: s_and_b32 s0, s0, s1 @@ -117,29 +117,31 @@ define void @issue92561(ptr addrspace(1) %arg) { ; GISEL-NEXT: s_and_b32 s0, s0, s2 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_and_saveexec_b32 s0, s0 -; GISEL-NEXT: image_sample_c_lz v1, [v0, v0, v0, v0], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY -; GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GISEL-NEXT: ; implicit-def: $vgpr0 +; GISEL-NEXT: image_sample_c_lz v9, [v8, v8, v8, v8], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GISEL-NEXT: s_cbranch_execnz .LBB0_1 ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b32 exec_lo, s3 -; GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1.0 -; GISEL-NEXT: v_mov_b32_e32 v0, 0x7fc00000 +; GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0x7fc00000 +; GISEL-NEXT: v_mov_b32_e32 v2, 1.0 ; GISEL-NEXT: s_clause 0x2 -; GISEL-NEXT: image_sample_c_lz v0, [v2, v2, v0, v2], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY -; GISEL-NEXT: image_sample_c_lz v3, [v2, v3, v2, v2], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY -; GISEL-NEXT: image_sample_c_lz v4, [v2, v2, v2, v2], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GISEL-NEXT: image_sample_c_lz v0, [v1, v1, v0, v1], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GISEL-NEXT: image_sample_c_lz v2, [v1, v2, v1, v1], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GISEL-NEXT: image_sample_c_lz v3, [v1, v1, v1, v1], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GISEL-NEXT: s_waitcnt vmcnt(2) -; GISEL-NEXT: v_add_f32_e32 v0, v1, v0 +; GISEL-NEXT: v_add_f32_e32 v0, v9, v0 ; GISEL-NEXT: s_waitcnt vmcnt(1) -; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_mov_b32 v3, v2 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GISEL-NEXT: v_add_f32_e32 v0, v2, v0 +; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_add_f32_e32 v0, v4, v0 +; GISEL-NEXT: v_add_f32_e32 v0, v3, v0 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-NEXT: v_mul_f32_e32 v1, 0x3e800000, v0 -; GISEL-NEXT: image_store v[1:3], [v2, v2], s[4:11] dim:SQ_RSRC_IMG_2D unorm +; GISEL-NEXT: v_mul_f32_e32 v0, 0x3e800000, v0 +; GISEL-NEXT: image_store v[0:2], [v1, v1], s[4:11] dim:SQ_RSRC_IMG_2D unorm ; GISEL-NEXT: s_setpc_b64 s[30:31] bb: %descriptor = load <8 x i32>, ptr addrspace(1) %arg, align 32 diff --git a/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir 
b/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir index ea8e2edb80c7e..6d2f4e76840ae 100644 --- a/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir @@ -162,7 +162,7 @@ body: | ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub2 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY4]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -189,7 +189,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub0_sub1, %subreg.sub0, [[COPY1]], %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub0 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:vreg_128 = COPY $vgpr4_vgpr5_vgpr6_vgpr7 @@ -212,7 +212,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]].sub0_sub1, %subreg.sub0, [[COPY1]], %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub0 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:vreg_128 = COPY $vgpr4_vgpr5_vgpr6_vgpr7 @@ -285,7 +285,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1, [[V_MOV_B32_e32_]], %subreg.sub2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]] %0:vreg_64 = COPY $vgpr1_vgpr2 %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -311,8 +311,8 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY]].sub0, %subreg.sub2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]], implicit [[COPY2]], implicit [[COPY3]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 @@ -340,7 +340,7 @@ body: | ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; CHECK-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY]].sub0, %subreg.sub2, [[V_MOV_B32_e32_1]], %subreg.sub3 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub2 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -367,8 +367,8 @@ body: | ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY]].sub0, %subreg.sub2, [[V_MOV_B32_e32_1]], %subreg.sub3 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub2 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]], implicit [[COPY2]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -420,7 +420,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY1]].sub0_sub1, %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub1 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 @@ -444,7 +444,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY1]].sub0_sub1, %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub2 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 @@ -468,7 +468,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY1]].sub0_sub1, %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub3 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 @@ -492,7 +492,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY1]].sub0_sub1, %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub0 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 @@ -516,7 +516,7 @@ body: | 
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY1]].sub0_sub1, %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub1 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 @@ -540,7 +540,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]].sub0_sub1, %subreg.sub2_sub3, [[COPY]].sub2_sub3, %subreg.sub0_sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub1 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 @@ -564,7 +564,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]].sub0_sub1, %subreg.sub2_sub3, [[COPY]].sub2_sub3, %subreg.sub0_sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub3 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 @@ -588,7 +588,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr5_vgpr6 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub1_sub2_sub3, %subreg.sub0_sub1_sub2, [[COPY1]].sub1, %subreg.sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vreg_64 = COPY $vgpr5_vgpr6 @@ -615,9 +615,9 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr5_vgpr6 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub1_sub2_sub3, %subreg.sub0_sub1_sub2, [[COPY1]].sub1, %subreg.sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub2 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub1 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]], implicit [[COPY5]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 @@ -650,12 +650,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr5_vgpr6 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_192 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub4_sub5, [[COPY]].sub1_sub2, %subreg.sub0_sub1, [[COPY1]], %subreg.sub2_sub3 - ; CHECK-NEXT: 
[[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub4 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub5 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub2 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub1 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub2 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub3 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY6]], implicit [[COPY7]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vreg_64 = COPY $vgpr5_vgpr6 From b5b8a59a530b69f02bfc98b1ab8758e1757ddb8f Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 18 Feb 2025 08:22:45 +0700 Subject: [PATCH 064/127] AMDGPU: Implement getRequiredProperties for SIFoldOperands (#127522) Fix the broken MIR tests violating isSSA. --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 5 +++ llvm/lib/Target/AMDGPU/SIFoldOperands.h | 5 +++ .../AMDGPU/constant-fold-imm-immreg.mir | 41 ++++++++++--------- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index d8f3f9c54abc1..999553bfaff38 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -166,6 +166,11 @@ class SIFoldOperandsLegacy : public MachineFunctionPass { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } }; } // End anonymous namespace. 
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.h b/llvm/lib/Target/AMDGPU/SIFoldOperands.h index d6b8f6a729526..c419ec0911e20 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.h +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.h @@ -17,6 +17,11 @@ class SIFoldOperandsPass : public PassInfoMixin { SIFoldOperandsPass() = default; PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM); + + MachineFunctionProperties getRequiredProperties() const { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } }; } // namespace llvm diff --git a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir index 3db2b6ed9ab4b..39b5076ebe5ac 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -96,8 +96,8 @@ body: | %12:vgpr_32 = V_AND_B32_e64 %8, %8, implicit $exec FLAT_STORE_DWORD %19, %12, 0, 0, implicit $exec, implicit $flat_scr - %13:vgpr_32 = V_AND_B32_e64 %16, %16, implicit $exec - FLAT_STORE_DWORD %19, %13, 0, 0, implicit $exec, implicit $flat_scr + %21:vgpr_32 = V_AND_B32_e64 %16, %16, implicit $exec + FLAT_STORE_DWORD %19, %21, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 @@ -191,6 +191,7 @@ body: | name: v_fold_ashr_imm_regimm_32 tracksRegLiveness: true +isSSA: true liveins: - { reg: '$sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '$vgpr0', virtual-reg: '%2' } @@ -232,8 +233,8 @@ body: | %14:vgpr_32 = V_ASHR_I32_e64 7, %29, implicit $exec FLAT_STORE_DWORD %20, %14, 0, 0, implicit $exec, implicit $flat_scr - %15:vgpr_32 = V_ASHR_I32_e64 %27, %24, implicit $exec - FLAT_STORE_DWORD %20, %15, 0, 0, implicit $exec, implicit $flat_scr + %33:vgpr_32 = V_ASHR_I32_e64 %27, %24, implicit $exec + FLAT_STORE_DWORD %20, %33, 0, 0, implicit $exec, implicit $flat_scr %22:vgpr_32 = V_ASHR_I32_e64 %6, 4, implicit $exec FLAT_STORE_DWORD %20, %22, 0, 0, implicit $exec, implicit $flat_scr @@ -356,8 +357,8 @@ body: | %14:vgpr_32 = V_LSHR_B32_e64 7, %29, implicit $exec FLAT_STORE_DWORD %20, %14, 0, 0, implicit $exec, implicit $flat_scr - %15:vgpr_32 = V_LSHR_B32_e64 %27, %24, implicit $exec - FLAT_STORE_DWORD %20, %15, 0, 0, implicit $exec, implicit $flat_scr + %33:vgpr_32 = V_LSHR_B32_e64 %27, %24, implicit $exec + FLAT_STORE_DWORD %20, %33, 0, 0, implicit $exec, implicit $flat_scr %22:vgpr_32 = V_LSHR_B32_e64 %6, 4, implicit $exec FLAT_STORE_DWORD %20, %22, 0, 0, implicit $exec, implicit $flat_scr @@ -497,8 +498,8 @@ body: | # GCN: %17:vgpr_32 = V_MOV_B32_e32 1234567, implicit $exec # GCN: FLAT_STORE_DWORD %10, %17, -# GCN: %3:vgpr_32 = V_MOV_B32_e32 63, implicit $exec -# GCN: FLAT_STORE_DWORD %10, %3, +# GCN: %18:vgpr_32 = V_MOV_B32_e32 63, implicit $exec +# GCN: FLAT_STORE_DWORD %10, %18, name: v_fold_or_imm_regimm_32 alignment: 0 @@ -536,8 +537,8 @@ body: | FLAT_STORE_DWORD %19, %11, 0, 0, implicit $exec, implicit $flat_scr %12:vgpr_32 = V_OR_B32_e64 %8, %8, implicit $exec FLAT_STORE_DWORD %19, %12, 0, 0, implicit $exec, implicit $flat_scr - %13:vgpr_32 = V_OR_B32_e64 %16, %16, implicit $exec - FLAT_STORE_DWORD %19, %13, 0, 0, implicit $exec, implicit $flat_scr + %21:vgpr_32 = V_OR_B32_e64 %16, %16, implicit $exec + FLAT_STORE_DWORD %19, %21, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... 
@@ -689,24 +690,24 @@ body: | # GCN: %19:vgpr_32 = V_MOV_B32_e32 24, implicit $exec # GCN: FLAT_STORE_DWORD %10, %19, -# GCN: %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec -# GCN: FLAT_STORE_DWORD %10, %3, - -# GCN: %20:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec +# GCN: %20:vgpr_32 = V_MOV_B32_e32 0, implicit $exec # GCN: FLAT_STORE_DWORD %10, %20, -# GCN: %21:vgpr_32 = V_MOV_B32_e32 1, implicit $exec +# GCN: %21:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec # GCN: FLAT_STORE_DWORD %10, %21, -# GCN: %22:vgpr_32 = V_MOV_B32_e32 2, implicit $exec +# GCN: %22:vgpr_32 = V_MOV_B32_e32 1, implicit $exec # GCN: FLAT_STORE_DWORD %10, %22, -# GCN: %23:vgpr_32 = V_MOV_B32_e32 7927808, implicit $exec +# GCN: %23:vgpr_32 = V_MOV_B32_e32 2, implicit $exec # GCN: FLAT_STORE_DWORD %10, %23, -# GCN: %24:vgpr_32 = V_MOV_B32_e32 -8, implicit $exec +# GCN: %24:vgpr_32 = V_MOV_B32_e32 7927808, implicit $exec # GCN: FLAT_STORE_DWORD %10, %24, +# GCN: %25:vgpr_32 = V_MOV_B32_e32 -8, implicit $exec +# GCN: FLAT_STORE_DWORD %10, %25, + name: v_fold_shl_imm_regimm_32 alignment: 0 exposesReturnsTwice: false @@ -745,8 +746,8 @@ body: | FLAT_STORE_DWORD %20, %13, 0, 0, implicit $exec, implicit $flat_scr %14:vgpr_32 = V_LSHL_B32_e64 12, %7, implicit $exec FLAT_STORE_DWORD %20, %14, 0, 0, implicit $exec, implicit $flat_scr - %15:vgpr_32 = V_LSHL_B32_e64 12, %24, implicit $exec - FLAT_STORE_DWORD %20, %15, 0, 0, implicit $exec, implicit $flat_scr + %30:vgpr_32 = V_LSHL_B32_e64 12, %24, implicit $exec + FLAT_STORE_DWORD %20, %30, 0, 0, implicit $exec, implicit $flat_scr %22:vgpr_32 = V_LSHL_B32_e64 %6, 12, implicit $exec FLAT_STORE_DWORD %20, %22, 0, 0, implicit $exec, implicit $flat_scr %23:vgpr_32 = V_LSHL_B32_e64 %6, 32, implicit $exec From 09d14149f61d1f723ed39ce5297c572d53eb7c44 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Tue, 18 Feb 2025 02:41:25 +0100 Subject: [PATCH 065/127] [clang][bytecode] Fix return value of array CXXNewExprs (#127526) Just like with the __builtin_operator_new version, we need to point to the first array element, not the array itself.
--- clang/lib/AST/ByteCode/Interp.cpp | 3 ++- clang/lib/AST/ByteCode/Interp.h | 13 +++++++++++-- clang/test/AST/ByteCode/new-delete.cpp | 14 ++++++++++++++ 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index c80be094856b0..0310870f7372e 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -1063,7 +1063,8 @@ bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm, return false; } - if (!Ptr.isRoot() || Ptr.isOnePastEnd() || Ptr.isArrayElement()) { + if (!Ptr.isRoot() || Ptr.isOnePastEnd() || + (Ptr.isArrayElement() && Ptr.getIndex() != 0)) { const SourceInfo &Loc = S.Current->getSource(OpPC); S.FFDiag(Loc, diag::note_constexpr_delete_subobject) << Ptr.toDiagnosticString(S.getASTContext()) << Ptr.isOnePastEnd(); diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 73cc107b7dbff..10cf21e28437c 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -2915,13 +2915,17 @@ inline bool AllocN(InterpState &S, CodePtr OpPC, PrimType T, const Expr *Source, S.Stk.push(0, nullptr); return true; } + assert(NumElements.isPositive()); DynamicAllocator &Allocator = S.getAllocator(); Block *B = Allocator.allocate(Source, T, static_cast(NumElements), S.Ctx.getEvalID(), DynamicAllocator::Form::Array); assert(B); - S.Stk.push(B); + if (NumElements.isZero()) + S.Stk.push(B); + else + S.Stk.push(Pointer(B).atIndex(0)); return true; } @@ -2941,13 +2945,18 @@ inline bool AllocCN(InterpState &S, CodePtr OpPC, const Descriptor *ElementDesc, S.Stk.push(0, ElementDesc); return true; } + assert(NumElements.isPositive()); DynamicAllocator &Allocator = S.getAllocator(); Block *B = Allocator.allocate(ElementDesc, static_cast(NumElements), S.Ctx.getEvalID(), DynamicAllocator::Form::Array); assert(B); - S.Stk.push(B); + if (NumElements.isZero()) + S.Stk.push(B); + else + S.Stk.push(Pointer(B).atIndex(0)); + return true; } diff --git a/clang/test/AST/ByteCode/new-delete.cpp b/clang/test/AST/ByteCode/new-delete.cpp index e9850d27666e5..7e5f6ab8815ea 100644 --- a/clang/test/AST/ByteCode/new-delete.cpp +++ b/clang/test/AST/ByteCode/new-delete.cpp @@ -922,6 +922,20 @@ namespace NonConstexprArrayCtor { // both-note {{in call to}} } +namespace ArrayBaseCast { + struct A {}; + struct B : A {}; + constexpr bool test() { + B *b = new B[2]; + + A* a = b; + + delete[] b; + return true; + } + static_assert(test()); +} + #else /// Make sure we reject this prior to C++20 constexpr int a() { // both-error {{never produces a constant expression}} From 51c91095ab5b1f8f0d65e6fbd551e8b991866b3f Mon Sep 17 00:00:00 2001 From: Vy Nguyen Date: Mon, 17 Feb 2025 20:59:12 -0500 Subject: [PATCH 066/127] Reapply "Make llvm::telemetry::Manager::preDispatch protected. (#127114) (#127431) This reverts commit 66465c3b0ab1b32403ad5a1c3114174d87830f54. New change: added missing return statement. --- llvm/include/llvm/Telemetry/Telemetry.h | 9 +++++---- llvm/lib/Telemetry/Telemetry.cpp | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/Telemetry/Telemetry.h b/llvm/include/llvm/Telemetry/Telemetry.h index 344a49df5cbf0..42319f3ef51f2 100644 --- a/llvm/include/llvm/Telemetry/Telemetry.h +++ b/llvm/include/llvm/Telemetry/Telemetry.h @@ -138,10 +138,6 @@ class Manager { public: virtual ~Manager() = default; - // Optional callback for subclasses to perform additional tasks before - // dispatching to Destinations. 
- virtual Error preDispatch(TelemetryInfo *Entry) = 0; - // Dispatch Telemetry data to the Destination(s). // The argument is non-const because the Manager may add or remove // data from the entry. @@ -150,6 +146,11 @@ class Manager { // Register a Destination. void addDestination(std::unique_ptr Destination); +protected: + // Optional callback for subclasses to perform additional tasks before + // dispatching to Destinations. + virtual Error preDispatch(TelemetryInfo *Entry); + private: std::vector> Destinations; }; diff --git a/llvm/lib/Telemetry/Telemetry.cpp b/llvm/lib/Telemetry/Telemetry.cpp index 9e13d08334e3b..d86ad9c1c37bb 100644 --- a/llvm/lib/Telemetry/Telemetry.cpp +++ b/llvm/lib/Telemetry/Telemetry.cpp @@ -1,3 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file provides the basic framework for Telemetry. +/// Refer to its documentation at llvm/docs/Telemetry.rst for more details. +//===---------------------------------------------------------------------===// + #include "llvm/Telemetry/Telemetry.h" namespace llvm { @@ -22,5 +35,7 @@ void Manager::addDestination(std::unique_ptr Dest) { Destinations.push_back(std::move(Dest)); } +Error Manager::preDispatch(TelemetryInfo *Entry) { return Error::success(); } + } // namespace telemetry } // namespace llvm From d19187f5fe01c89a09c1b2f14849a3f29988d6d5 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Mon, 17 Feb 2025 18:22:48 -0800 Subject: [PATCH 067/127] [AMDGPU] Move into SIProgramInfo and cache getFunctionCodeSize. NFCI. (#127111) This moves the function as is; improvements to the estimate go into a subsequent patch. --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 26 +++----------------- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 2 -- llvm/lib/Target/AMDGPU/SIProgramInfo.cpp | 27 +++++++++++++++++++++ llvm/lib/Target/AMDGPU/SIProgramInfo.h | 6 +++++ 4 files changed, 36 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 031d8f0560ff2..a8d0bb746d2ef 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -748,7 +748,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize, OutContext, IsLocal) ->getVariableValue(), - getFunctionCodeSize(MF), MFI); + CurrentProgramInfo.getFunctionCodeSize(MF), MFI); return false; } @@ -757,7 +757,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { CurrentProgramInfo.NumArchVGPR, STM.hasMAIInsts() ?
CurrentProgramInfo.NumAccVGPR : nullptr, CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR, - CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI); + CurrentProgramInfo.ScratchSize, + CurrentProgramInfo.getFunctionCodeSize(MF), MFI); OutStreamer->emitRawComment( " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); @@ -893,27 +894,6 @@ void AMDGPUAsmPrinter::initializeTargetID(const Module &M) { } } -uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const { - const GCNSubtarget &STM = MF.getSubtarget(); - const SIInstrInfo *TII = STM.getInstrInfo(); - - uint64_t CodeSize = 0; - - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - // TODO: CodeSize should account for multiple functions. - - // TODO: Should we count size of debug info? - if (MI.isDebugInstr()) - continue; - - CodeSize += TII->getInstSizeInBytes(MI); - } - } - - return CodeSize; -} - // AccumOffset computed for the MCExpr equivalent of: // alignTo(std::max(1, NumVGPR), 4) / 4 - 1; static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index cc8c4411805e2..2c959d7dbbd07 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -50,8 +50,6 @@ class AMDGPUAsmPrinter final : public AsmPrinter { MCCodeEmitter *DumpCodeInstEmitter = nullptr; - uint64_t getFunctionCodeSize(const MachineFunction &MF) const; - void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF); void getAmdKernelCode(AMDGPU::AMDGPUMCKernelCodeT &Out, const SIProgramInfo &KernelInfo, diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index 212edff097837..5179288084010 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -27,6 +27,8 @@ void SIProgramInfo::reset(const MachineFunction &MF) { const MCExpr *ZeroExpr = MCConstantExpr::create(0, Ctx); + CodeSizeInBytes.reset(); + VGPRBlocks = ZeroExpr; SGPRBlocks = ZeroExpr; Priority = 0; @@ -199,3 +201,28 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC, return MCConstantExpr::create(0, Ctx); } + +uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { + if (CodeSizeInBytes.has_value()) + return *CodeSizeInBytes; + + const GCNSubtarget &STM = MF.getSubtarget(); + const SIInstrInfo *TII = STM.getInstrInfo(); + + uint64_t CodeSize = 0; + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + // TODO: CodeSize should account for multiple functions. + + // TODO: Should we count size of debug info? + if (MI.isDebugInstr()) + continue; + + CodeSize += TII->getInstSizeInBytes(MI); + } + } + + CodeSizeInBytes = CodeSize; + return CodeSize; +} diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h index 37c03d9b637f0..d7087436ae758 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -19,6 +19,7 @@ #include "llvm/IR/CallingConv.h" #include "llvm/Support/Compiler.h" #include +#include namespace llvm { @@ -29,6 +30,8 @@ class MachineFunction; /// Track resource usage for kernels / entry functions. struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo { + std::optional CodeSizeInBytes; + // Fields set in PGM_RSRC1 pm4 packet. 
const MCExpr *VGPRBlocks = nullptr; const MCExpr *SGPRBlocks = nullptr; @@ -97,6 +100,9 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo { // non-MCExpr members. void reset(const MachineFunction &MF); + // Get function code size and cache the value. + uint64_t getFunctionCodeSize(const MachineFunction &MF); + /// Compute the value of the ComputePGMRsrc1 register. const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const; From f71b83b3593588c56fd4ab3e1347ad9c7bec624f Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 17 Feb 2025 18:26:18 -0800 Subject: [PATCH 068/127] [lldb] Add a release note for #127419 --- llvm/docs/ReleaseNotes.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index c9543ff09217a..75638d75e70f2 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -155,6 +155,8 @@ Changes to LLDB does not provide API to query the number of supported hardware watchpoints. Therefore current implementation allows only 1 watchpoint, as tested with Windows 11 on the Microsoft SQ2 and Snapdragon Elite X platforms. +* LLDB now steps through C++ thunks. This fixes an issue where previously, it + wouldn't step into multiple inheritance virtual functions. ### Changes to lldb-dap From fe1ef413ab3634cf9e10bcd68f0633b28d7e2228 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 18 Feb 2025 10:31:46 +0700 Subject: [PATCH 069/127] AMDGPU: Add more tests for peephole-opt immediate folding (#127480) --- .../test/CodeGen/AMDGPU/peephole-fold-imm.mir | 214 ++++++++++++++++++ 1 file changed, 214 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir index d070a8ef5dd2d..cceed6fd008e4 100644 --- a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir +++ b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir @@ -344,3 +344,217 @@ body: | %3:vgpr_32 = V_FMA_F32_e64 0, %0, 0, %1, 0, %2.sub1, 0, 0, implicit $mode, implicit $exec SI_RETURN_TO_EPILOG %3 ... + +--- +name: fold_aimm_virtual +body: | + bb.0: + + ; GCN-LABEL: name: fold_aimm_virtual + ; GCN: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_ACCVGPR_WRITE_B32_e64_]] + %0:agpr_32 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec + %1:agpr_32 = COPY killed %0 + SI_RETURN_TO_EPILOG implicit %1 + +... + +--- +name: fold_aimm_virtual_copy_to_vgpr +body: | + bb.0: + + ; GCN-LABEL: name: fold_aimm_virtual_copy_to_vgpr + ; GCN: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 64, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B32_e32_]] + %0:agpr_32 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec + %1:vgpr_32 = COPY killed %0 + SI_RETURN_TO_EPILOG implicit %1 + +... + +--- +name: fold_v_mov_b64_64_sub0_to_vgpr_32 +body: | + bb.0: + + ; GCN-LABEL: name: fold_v_mov_b64_64_sub0_to_vgpr_32 + ; GCN: [[V_MOV_B64_e32_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1412567312, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]] + %0:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec + %1:vgpr_32 = COPY killed %0.sub0 + SI_RETURN_TO_EPILOG %1 + +... 
+ +--- +name: fold_v_mov_b64_64_sub1_to_vgpr_32 +body: | + bb.0: + + ; GCN-LABEL: name: fold_v_mov_b64_64_sub1_to_vgpr_32 + ; GCN: [[V_MOV_B64_e32_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 305419896, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]] + %0:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec + %1:vgpr_32 = COPY killed %0.sub1 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_v_mov_b64_64 +body: | + bb.0: + + ; GCN-LABEL: name: fold_v_mov_b64_64 + ; GCN: [[V_MOV_B64_e32_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec + ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]] + %0:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec + %1:vreg_64_align2 = COPY killed %0 + SI_RETURN_TO_EPILOG implicit %1 + +... + +# FIXME: +# --- +# name: fold_v_mov_b64_64_to_unaligned +# body: | +# bb.0: +# %0:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec +# %1:vreg_64 = COPY killed %0 +# SI_RETURN_TO_EPILOG implicit %1 +# ... + +# FIXME: +# --- +# name: fold_v_mov_b64_pseudo_64_to_unaligned +# body: | +# bb.0: +# %0:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec +# %1:vreg_64 = COPY killed %0 +# SI_RETURN_TO_EPILOG implicit %1 +# ... + +--- +name: fold_s_brev_b32_simm_virtual_0 +body: | + bb.0: + + ; GCN-LABEL: name: fold_s_brev_b32_simm_virtual_0 + ; GCN: [[S_BREV_B32_:%[0-9]+]]:sreg_32 = S_BREV_B32 1 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY killed [[S_BREV_B32_]] + ; GCN-NEXT: SI_RETURN_TO_EPILOG + %0:sreg_32 = S_BREV_B32 1 + %1:sreg_32 = COPY killed %0 + SI_RETURN_TO_EPILOG + +... + +--- +name: fold_s_brev_b32_simm_virtual_1 +body: | + bb.0: + + ; GCN-LABEL: name: fold_s_brev_b32_simm_virtual_1 + ; GCN: [[S_BREV_B32_:%[0-9]+]]:sreg_32 = S_BREV_B32 -64 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY killed [[S_BREV_B32_]] + ; GCN-NEXT: SI_RETURN_TO_EPILOG + %0:sreg_32 = S_BREV_B32 -64 + %1:sreg_32 = COPY killed %0 + SI_RETURN_TO_EPILOG + +... + +--- +name: fold_v_bfrev_b32_e32_imm +body: | + bb.0: + + ; GCN-LABEL: name: fold_v_bfrev_b32_e32_imm + ; GCN: [[V_BFREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_BFREV_B32_e32 1, implicit $exec + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed [[V_BFREV_B32_e32_]] + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]] + %0:vgpr_32 = V_BFREV_B32_e32 1, implicit $exec + %1:vgpr_32 = COPY killed %0 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_v_bfrev_b32_e64_imm +body: | + bb.0: + + ; GCN-LABEL: name: fold_v_bfrev_b32_e64_imm + ; GCN: [[V_BFREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFREV_B32_e64 1, implicit $exec + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed [[V_BFREV_B32_e64_]] + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]] + %0:vgpr_32 = V_BFREV_B32_e64 1, implicit $exec + %1:vgpr_32 = COPY killed %0 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_s_not_b32_simm_virtual_0 +body: | + bb.0: + + ; GCN-LABEL: name: fold_s_not_b32_simm_virtual_0 + ; GCN: [[S_NOT_B32_:%[0-9]+]]:sreg_32 = S_NOT_B32 1, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY killed [[S_NOT_B32_]] + ; GCN-NEXT: SI_RETURN_TO_EPILOG + %0:sreg_32 = S_NOT_B32 1, implicit-def $scc + %1:sreg_32 = COPY killed %0 + SI_RETURN_TO_EPILOG + +... 
+ +--- +name: fold_s_not_b32_simm_virtual_1 +body: | + bb.0: + + ; GCN-LABEL: name: fold_s_not_b32_simm_virtual_1 + ; GCN: [[S_NOT_B32_:%[0-9]+]]:sreg_32 = S_NOT_B32 -64, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY killed [[S_NOT_B32_]] + ; GCN-NEXT: SI_RETURN_TO_EPILOG + %0:sreg_32 = S_NOT_B32 -64, implicit-def $scc + %1:sreg_32 = COPY killed %0 + SI_RETURN_TO_EPILOG + +... + +--- +name: fold_v_not_b32_e32_imm +body: | + bb.0: + + ; GCN-LABEL: name: fold_v_not_b32_e32_imm + ; GCN: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 1, implicit $exec + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed [[V_NOT_B32_e32_]] + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]] + %0:vgpr_32 = V_NOT_B32_e32 1, implicit $exec + %1:vgpr_32 = COPY killed %0 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_v_not_b32_e64_imm +body: | + bb.0: + + ; GCN-LABEL: name: fold_v_not_b32_e64_imm + ; GCN: [[V_NOT_B32_e64_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e64 1, implicit $exec + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed [[V_NOT_B32_e64_]] + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]] + %0:vgpr_32 = V_NOT_B32_e64 1, implicit $exec + %1:vgpr_32 = COPY killed %0 + SI_RETURN_TO_EPILOG %1 + +... From 4dee305ce2c92fbffd51ac1948e5916bccf2c9cb Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 18 Feb 2025 10:34:48 +0700 Subject: [PATCH 070/127] AMDGPU: Fix foldImmediate breaking register class constraints (#127481) This fixes a verifier error when folding an immediate materialized into an aligned vgpr class into a copy to an unaligned virtual register. --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 11 +++-- .../test/CodeGen/AMDGPU/peephole-fold-imm.mir | 43 +++++++++++-------- 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 8481c6333f479..f51527d0eb148 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3473,14 +3473,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, assert(UseMI.getOperand(1).getReg().isVirtual()); } + MachineFunction *MF = UseMI.getMF(); const MCInstrDesc &NewMCID = get(NewOpc); - if (DstReg.isPhysical() && - !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg)) + const TargetRegisterClass *NewDefRC = getRegClass(NewMCID, 0, &RI, *MF); + + if (DstReg.isPhysical()) { + if (!NewDefRC->contains(DstReg)) + return false; + } else if (!MRI->constrainRegClass(DstReg, NewDefRC)) return false; UseMI.setDesc(NewMCID); UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue()); - UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); + UseMI.addImplicitDefUseOperands(*MF); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir index cceed6fd008e4..227af34f3fa6f 100644 --- a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir +++ b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir @@ -419,25 +419,30 @@ body: | ... -# FIXME: -# --- -# name: fold_v_mov_b64_64_to_unaligned -# body: | -# bb.0: -# %0:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec -# %1:vreg_64 = COPY killed %0 -# SI_RETURN_TO_EPILOG implicit %1 -# ... - -# FIXME: -# --- -# name: fold_v_mov_b64_pseudo_64_to_unaligned -# body: | -# bb.0: -# %0:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec -# %1:vreg_64 = COPY killed %0 -# SI_RETURN_TO_EPILOG implicit %1 -# ... 
+--- +name: fold_v_mov_b64_64_to_unaligned +body: | + bb.0: + ; GCN-LABEL: name: fold_v_mov_b64_64_to_unaligned + ; GCN: [[V_MOV_B64_e32_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec + ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]] + %0:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec + %1:vreg_64 = COPY killed %0 + SI_RETURN_TO_EPILOG implicit %1 +... + +--- +name: fold_v_mov_b64_pseudo_64_to_unaligned +body: | + bb.0: + ; GCN-LABEL: name: fold_v_mov_b64_pseudo_64_to_unaligned + ; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]] + %0:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec + %1:vreg_64 = COPY killed %0 + SI_RETURN_TO_EPILOG implicit %1 +... --- name: fold_s_brev_b32_simm_virtual_0 From af1e2a374e3845454914348793341f4f931e805a Mon Sep 17 00:00:00 2001 From: yingopq <115543042+yingopq@users.noreply.github.com> Date: Tue, 18 Feb 2025 11:57:23 +0800 Subject: [PATCH 071/127] [Mips] Support llvm.readcyclecounter intrinsic (#127553) The llvm.readcyclecounter intrinsic can be implemented via the `rdhwr $2, $hwr_cc` instruction. $hwr_cc: High-resolution cycle counter. This register provides read access to the coprocessor 0 Count Register. Fix #106318. --- llvm/lib/Target/Mips/MipsISelLowering.cpp | 44 +++++++++++++++++++++ llvm/lib/Target/Mips/MipsISelLowering.h | 1 + llvm/test/CodeGen/Mips/readcyclecounter.ll | 45 ++++++++++++++++++++++ 3 files changed, 90 insertions(+) create mode 100644 llvm/test/CodeGen/Mips/readcyclecounter.ll diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index afec52e289e22..e737c5aeb43c6 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -357,6 +357,10 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + if (Subtarget.hasMips32r2() || + getTargetMachine().getTargetTriple().isOSLinux()) + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); + // Lower fmin/fmax/fclass operations for MIPS R6. 
if (Subtarget.hasMips32r6()) { setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal); @@ -1315,6 +1319,8 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::STORE: return lowerSTORE(Op, DAG); case ISD::EH_DWARF_CFA: return lowerEH_DWARF_CFA(Op, DAG); case ISD::FP_TO_SINT: return lowerFP_TO_SINT(Op, DAG); + case ISD::READCYCLECOUNTER: + return lowerREADCYCLECOUNTER(Op, DAG); } return SDValue(); } @@ -2096,6 +2102,44 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword( return exitMBB; } +SDValue MipsTargetLowering::lowerREADCYCLECOUNTER(SDValue Op, + SelectionDAG &DAG) const { + SmallVector Results; + SDLoc DL(Op); + MachineFunction &MF = DAG.getMachineFunction(); + unsigned RdhwrOpc, DestReg; + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + + if (PtrVT == MVT::i64) { + RdhwrOpc = Mips::RDHWR64; + DestReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); + SDNode *Rdhwr = DAG.getMachineNode(RdhwrOpc, DL, MVT::i64, MVT::Glue, + DAG.getRegister(Mips::HWR2, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32)); + SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, DestReg, + SDValue(Rdhwr, 0), SDValue(Rdhwr, 1)); + SDValue ResNode = + DAG.getCopyFromReg(Chain, DL, DestReg, MVT::i64, Chain.getValue(1)); + Results.push_back(ResNode); + Results.push_back(ResNode.getValue(1)); + } else { + RdhwrOpc = Mips::RDHWR; + DestReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i32)); + SDNode *Rdhwr = DAG.getMachineNode(RdhwrOpc, DL, MVT::i32, MVT::Glue, + DAG.getRegister(Mips::HWR2, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32)); + SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, DestReg, + SDValue(Rdhwr, 0), SDValue(Rdhwr, 1)); + SDValue ResNode = + DAG.getCopyFromReg(Chain, DL, DestReg, MVT::i32, Chain.getValue(1)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResNode, + DAG.getConstant(0, DL, MVT::i32))); + Results.push_back(ResNode.getValue(1)); + } + + return DAG.getMergeValues(Results, DL); +} + SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // The first operand is the chain, the second is the condition, the third is // the block to branch to if the condition is true. diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h index ee1ab6a17a91e..1d5f5e663d531 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/llvm/lib/Target/Mips/MipsISelLowering.h @@ -591,6 +591,7 @@ class TargetRegisterClass; bool IsSRA) const; SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const; /// isEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. 
diff --git a/llvm/test/CodeGen/Mips/readcyclecounter.ll b/llvm/test/CodeGen/Mips/readcyclecounter.ll new file mode 100644 index 0000000000000..467dd92884b3d --- /dev/null +++ b/llvm/test/CodeGen/Mips/readcyclecounter.ll @@ -0,0 +1,45 @@ +;RUN: llc -mtriple=mipsel-linux-gnu -mcpu=mips32r2 < %s | FileCheck %s --check-prefix=MIPSEL +;RUN: llc -mtriple=mips64el-linux-gnuabi64 -mcpu=mips64r2 < %s | FileCheck %s --check-prefix=MIPS64EL +;RUN: llc -mtriple=mipsel-linux-gnu -mcpu=mips2 < %s | FileCheck %s --check-prefix=MIPSEL +;RUN: llc -mtriple=mips64el-linux-gnuabi64 -mcpu=mips3 < %s | FileCheck %s --check-prefix=MIPS64EL +;RUN: llc -mtriple=mipsel -mcpu=mips32r2 < %s | FileCheck %s --check-prefix=MIPSEL +;RUN: llc -mtriple=mips64el -mcpu=mips64r2 < %s | FileCheck %s --check-prefix=MIPS64EL +;RUN: llc -mtriple=mipsel -mcpu=mips2 < %s | FileCheck %s --check-prefix=MIPSEL_NOT_SUPPORTED +;RUN: llc -mtriple=mips64el -mcpu=mips3 < %s | FileCheck %s --check-prefix=MIPS64EL_NOT_SUPPORTED + +declare i64 @llvm.readcyclecounter() nounwind readnone + +define i64 @test_readcyclecounter() nounwind { +; MIPSEL-LABEL: test_readcyclecounter: +; MIPSEL: # %bb.0: # %entry +; MIPSEL-NEXT: .set push +; MIPSEL-NEXT: .set mips32r2 +; MIPSEL-NEXT: rdhwr $2, $hwr_cc +; MIPSEL-NEXT: .set pop +; MIPSEL-NEXT: jr $ra +; MIPSEL-NEXT: addiu $3, $zero, 0 +; +; MIPSEL_NOT_SUPPORTED-LABEL: test_readcyclecounter: +; MIPSEL_NOT_SUPPORTED: # %bb.0: # %entry +; MIPSEL_NOT_SUPPORTED-NEXT: addiu $2, $zero, 0 +; MIPSEL_NOT_SUPPORTED-NEXT: jr $ra +; MIPSEL_NOT_SUPPORTED-NEXT: addiu $3, $zero, 0 +; +; MIPS64EL-LABEL: test_readcyclecounter: +; MIPS64EL: # %bb.0: # %entry +; MIPS64EL-NEXT: .set push +; MIPS64EL-NEXT: .set mips32r2 +; MIPS64EL-NEXT: rdhwr $2, $hwr_cc +; MIPS64EL-NEXT: .set pop +; MIPS64EL-NEXT: jr $ra +; MIPS64EL-NEXT: nop +; +; MIPS64EL_NOT_SUPPORTED-LABEL: test_readcyclecounter: +; MIPS64EL_NOT_SUPPORTED: # %bb.0: # %entry +; MIPS64EL_NOT_SUPPORTED-NEXT: jr $ra +; MIPS64EL_NOT_SUPPORTED-NEXT: daddiu $2, $zero, 0 +entry: + %tmp0 = tail call i64 @llvm.readcyclecounter() + ret i64 %tmp0 +} + From 83d7f4b8c38147dbb57a40b385e70908ebbbb554 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 18 Feb 2025 11:21:02 +0700 Subject: [PATCH 072/127] AMDGPU: Implement getConstValDefinedInReg and use in foldImmediate (NFC) (#127482) This is NFC because it currently only matters for cases that are not isMoveImmediate, and we do not yet implement any of those. This just moves the implementation of foldImmediate to use the common interface, similar to how x86 does it. --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 67 ++++++++++++++------------ llvm/lib/Target/AMDGPU/SIInstrInfo.h | 22 +++++++-- 2 files changed, 54 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index f51527d0eb148..9e99df7524f4d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1327,6 +1327,33 @@ Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, return Reg; } +bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI, + const Register Reg, + int64_t &ImmVal) const { + // TODO: Handle all the special cases handled in SIShrinkInstructions + // (e.g. 
s_brev_b32 imm -> reverse(imm)) + switch (MI.getOpcode()) { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOVK_I32: + case AMDGPU::S_MOV_B64: + case AMDGPU::V_MOV_B64_e32: + case AMDGPU::V_ACCVGPR_WRITE_B32_e64: + case AMDGPU::S_MOV_B64_IMM_PSEUDO: + case AMDGPU::V_MOV_B64_PSEUDO: { + const MachineOperand &Src0 = MI.getOperand(1); + if (Src0.isImm()) { + ImmVal = Src0.getImm(); + return MI.getOperand(0).getReg() == Reg; + } + + return false; + } + default: + return false; + } +} + unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { if (RI.isAGPRClass(DstRC)) @@ -3395,27 +3422,11 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (!MRI->hasOneNonDBGUse(Reg)) return false; - switch (DefMI.getOpcode()) { - default: - return false; - case AMDGPU::V_MOV_B64_e32: - case AMDGPU::S_MOV_B64: - case AMDGPU::V_MOV_B64_PSEUDO: - case AMDGPU::S_MOV_B64_IMM_PSEUDO: - case AMDGPU::V_MOV_B32_e32: - case AMDGPU::S_MOV_B32: - case AMDGPU::V_ACCVGPR_WRITE_B32_e64: - break; - } - - const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); - assert(ImmOp); - // FIXME: We could handle FrameIndex values here. - if (!ImmOp->isImm()) + int64_t Imm; + if (!getConstValDefinedInReg(DefMI, Reg, Imm)) return false; - auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t { - int64_t Imm = ImmOp->getImm(); + auto getImmFor = [=](const MachineOperand &UseOp) -> int64_t { switch (UseOp.getSubReg()) { default: return Imm; @@ -3502,12 +3513,14 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // If this is a free constant, there's no reason to do this. // TODO: We could fold this here instead of letting SIFoldOperands do it // later. - MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); + int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0); // Any src operand can be used for the legality check. - if (isInlineConstant(UseMI, *Src0, *ImmOp)) + if (isInlineConstant(UseMI, Src0Idx, Imm)) return false; + MachineOperand *Src0 = &UseMI.getOperand(Src0Idx); + bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64; bool IsFMA = @@ -4267,18 +4280,11 @@ bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const { } } -bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, - uint8_t OperandType) const { - assert(!MO.isReg() && "isInlineConstant called on register operand!"); - if (!MO.isImm()) - return false; - +bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const { // MachineOperand provides no way to tell the true operand size, since it only // records a 64-bit value. We need to know the size to determine if a 32-bit // floating point immediate bit pattern is legal for an integer immediate. It // would be for any 32-bit integer operand, but would not be for a 64-bit one. 
- - int64_t Imm = MO.getImm(); switch (OperandType) { case AMDGPU::OPERAND_REG_IMM_INT32: case AMDGPU::OPERAND_REG_IMM_FP32: @@ -4300,8 +4306,7 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: - return AMDGPU::isInlinableLiteral64(MO.getImm(), - ST.hasInv2PiInlineImm()); + return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm()); case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_AC_INT16: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 811e4fcbebf57..ddd15e1766f70 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -278,6 +278,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const; + bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, + int64_t &ImmVal) const override; + void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, @@ -1063,7 +1066,13 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { // Some operands like FrameIndexes could resolve to an inline immediate value // that will not require an additional 4-bytes; this function assumes that it // will. - bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const; + bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const { + assert(!MO.isReg() && "isInlineConstant called on register operand!"); + if (!MO.isImm()) + return false; + return isInlineConstant(MO.getImm(), OperandType); + } + bool isInlineConstant(int64_t ImmVal, uint8_t OperandType) const; bool isInlineConstant(const MachineOperand &MO, const MCOperandInfo &OpInfo) const { @@ -1091,7 +1100,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { } bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx, - const MachineOperand &MO) const { + int64_t ImmVal) const { if (OpIdx >= MI.getDesc().NumOperands) return false; @@ -1101,10 +1110,15 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { uint8_t OpType = (Size == 8) ? AMDGPU::OPERAND_REG_IMM_INT64 : AMDGPU::OPERAND_REG_IMM_INT32; - return isInlineConstant(MO, OpType); + return isInlineConstant(ImmVal, OpType); } - return isInlineConstant(MO, MI.getDesc().operands()[OpIdx].OperandType); + return isInlineConstant(ImmVal, MI.getDesc().operands()[OpIdx].OperandType); + } + + bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx, + const MachineOperand &MO) const { + return isInlineConstant(MI, OpIdx, MO.getImm()); } bool isInlineConstant(const MachineOperand &MO) const { From c5def84ca4a1aa08333a0428bc453ea901139eca Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 18 Feb 2025 11:23:49 +0700 Subject: [PATCH 073/127] AMDGPU: Handle brev and not cases in getConstValDefinedInReg (#127483) We should not encounter these cases in the peephole-opt use today, but get the common helper function to handle these. 
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 24 +++++++++++++++++-- .../test/CodeGen/AMDGPU/peephole-fold-imm.mir | 24 +++++++++---------- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 9e99df7524f4d..4ee5ebd7681b8 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1330,8 +1330,6 @@ Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const { - // TODO: Handle all the special cases handled in SIShrinkInstructions - // (e.g. s_brev_b32 imm -> reverse(imm)) switch (MI.getOpcode()) { case AMDGPU::V_MOV_B32_e32: case AMDGPU::S_MOV_B32: @@ -1349,6 +1347,28 @@ bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI, return false; } + case AMDGPU::S_BREV_B32: + case AMDGPU::V_BFREV_B32_e32: + case AMDGPU::V_BFREV_B32_e64: { + const MachineOperand &Src0 = MI.getOperand(1); + if (Src0.isImm()) { + ImmVal = static_cast(reverseBits(Src0.getImm())); + return MI.getOperand(0).getReg() == Reg; + } + + return false; + } + case AMDGPU::S_NOT_B32: + case AMDGPU::V_NOT_B32_e32: + case AMDGPU::V_NOT_B32_e64: { + const MachineOperand &Src0 = MI.getOperand(1); + if (Src0.isImm()) { + ImmVal = static_cast(~static_cast(Src0.getImm())); + return MI.getOperand(0).getReg() == Reg; + } + + return false; + } default: return false; } diff --git a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir index 227af34f3fa6f..ddeb45a48a6ee 100644 --- a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir +++ b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir @@ -451,7 +451,7 @@ body: | ; GCN-LABEL: name: fold_s_brev_b32_simm_virtual_0 ; GCN: [[S_BREV_B32_:%[0-9]+]]:sreg_32 = S_BREV_B32 1 - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY killed [[S_BREV_B32_]] + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 ; GCN-NEXT: SI_RETURN_TO_EPILOG %0:sreg_32 = S_BREV_B32 1 %1:sreg_32 = COPY killed %0 @@ -466,7 +466,7 @@ body: | ; GCN-LABEL: name: fold_s_brev_b32_simm_virtual_1 ; GCN: [[S_BREV_B32_:%[0-9]+]]:sreg_32 = S_BREV_B32 -64 - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY killed [[S_BREV_B32_]] + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 67108863 ; GCN-NEXT: SI_RETURN_TO_EPILOG %0:sreg_32 = S_BREV_B32 -64 %1:sreg_32 = COPY killed %0 @@ -481,8 +481,8 @@ body: | ; GCN-LABEL: name: fold_v_bfrev_b32_e32_imm ; GCN: [[V_BFREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_BFREV_B32_e32 1, implicit $exec - ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed [[V_BFREV_B32_e32_]] - ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]] + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]] %0:vgpr_32 = V_BFREV_B32_e32 1, implicit $exec %1:vgpr_32 = COPY killed %0 SI_RETURN_TO_EPILOG %1 @@ -496,8 +496,8 @@ body: | ; GCN-LABEL: name: fold_v_bfrev_b32_e64_imm ; GCN: [[V_BFREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFREV_B32_e64 1, implicit $exec - ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed [[V_BFREV_B32_e64_]] - ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]] + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]] %0:vgpr_32 = V_BFREV_B32_e64 1, implicit $exec %1:vgpr_32 = COPY killed %0 SI_RETURN_TO_EPILOG %1 @@ -511,7 +511,7 @@ body: | ; GCN-LABEL: name: 
fold_s_not_b32_simm_virtual_0 ; GCN: [[S_NOT_B32_:%[0-9]+]]:sreg_32 = S_NOT_B32 1, implicit-def $scc - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY killed [[S_NOT_B32_]] + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2 ; GCN-NEXT: SI_RETURN_TO_EPILOG %0:sreg_32 = S_NOT_B32 1, implicit-def $scc %1:sreg_32 = COPY killed %0 @@ -526,7 +526,7 @@ body: | ; GCN-LABEL: name: fold_s_not_b32_simm_virtual_1 ; GCN: [[S_NOT_B32_:%[0-9]+]]:sreg_32 = S_NOT_B32 -64, implicit-def $scc - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY killed [[S_NOT_B32_]] + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 63 ; GCN-NEXT: SI_RETURN_TO_EPILOG %0:sreg_32 = S_NOT_B32 -64, implicit-def $scc %1:sreg_32 = COPY killed %0 @@ -541,8 +541,8 @@ body: | ; GCN-LABEL: name: fold_v_not_b32_e32_imm ; GCN: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 1, implicit $exec - ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed [[V_NOT_B32_e32_]] - ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]] + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]] %0:vgpr_32 = V_NOT_B32_e32 1, implicit $exec %1:vgpr_32 = COPY killed %0 SI_RETURN_TO_EPILOG %1 @@ -556,8 +556,8 @@ body: | ; GCN-LABEL: name: fold_v_not_b32_e64_imm ; GCN: [[V_NOT_B32_e64_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e64 1, implicit $exec - ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY killed [[V_NOT_B32_e64_]] - ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]] + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]] %0:vgpr_32 = V_NOT_B32_e64 1, implicit $exec %1:vgpr_32 = COPY killed %0 SI_RETURN_TO_EPILOG %1 From ef9f0b3c414a5d55e694829514d7b2ff8736d3c3 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 17 Feb 2025 20:26:05 -0800 Subject: [PATCH 074/127] [DAGCombiner] Don't peek through truncates of shift amounts in takeInexpensiveLog2. (#126957) Shift amounts in SelectionDAG don't have to match the result type of the shift. SelectionDAGBuilder will aggressively truncate shift amounts to the target's preferred type. This may result in a zero extend that existed in IR being removed. If we look through a truncate here, we can't guarantee the upper bits of the truncate input are zero. There may have been a zext that was removed. Unfortunately, this regresses tests where no truncate was involved. The only way I can think to fix this is to add an assertzext when SelectionDAGBuilder truncates a shift amount or remove the early truncation of shift amounts from SelectionDAGBuilder all together. Fixes #126889. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +- .../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 218 +++++++++++------- 2 files changed, 139 insertions(+), 85 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c6fd72b6b76f4..bc7cdf38dbc2a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -28446,7 +28446,11 @@ static SDValue takeInexpensiveLog2(SelectionDAG &DAG, const SDLoc &DL, EVT VT, return SDValue(); auto CastToVT = [&](EVT NewVT, SDValue ToCast) { - ToCast = PeekThroughCastsAndTrunc(ToCast); + // Peek through zero extend. We can't peek through truncates since this + // function is called on a shift amount. We must ensure that all of the bits + // above the original shift amount are zeroed by this function. 
+ while (ToCast.getOpcode() == ISD::ZERO_EXTEND) + ToCast = ToCast.getOperand(0); EVT CurVT = ToCast.getValueType(); if (NewVT == CurVT) return ToCast; diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 53517373d3e4d..e513b666ebf83 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -660,21 +660,25 @@ define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) { ret <8 x half> %r } +; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set +; in the original IR. define double @fmul_pow_shl_cnt(i64 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: shlq $52, %rdi -; CHECK-SSE-NEXT: movabsq $4621256167635550208, %rax # imm = 0x4022000000000000 -; CHECK-SSE-NEXT: addq %rdi, %rax -; CHECK-SSE-NEXT: movq %rax, %xmm0 +; CHECK-SSE-NEXT: movzbl %dil, %eax +; CHECK-SSE-NEXT: shlq $52, %rax +; CHECK-SSE-NEXT: movabsq $4621256167635550208, %rcx # imm = 0x4022000000000000 +; CHECK-SSE-NEXT: addq %rax, %rcx +; CHECK-SSE-NEXT: movq %rcx, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: fmul_pow_shl_cnt: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: shlq $52, %rdi -; CHECK-AVX-NEXT: movabsq $4621256167635550208, %rax # imm = 0x4022000000000000 -; CHECK-AVX-NEXT: addq %rdi, %rax -; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: movzbl %dil, %eax +; CHECK-AVX-NEXT: shlq $52, %rax +; CHECK-AVX-NEXT: movabsq $4621256167635550208, %rcx # imm = 0x4022000000000000 +; CHECK-AVX-NEXT: addq %rax, %rcx +; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 ; CHECK-AVX-NEXT: retq %shl = shl nuw i64 1, %cnt %conv = uitofp i64 %shl to double @@ -682,23 +686,27 @@ define double @fmul_pow_shl_cnt(i64 %cnt) nounwind { ret double %mul } +; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set +; in the original IR. define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt2: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: incl %edi -; CHECK-SSE-NEXT: shlq $52, %rdi -; CHECK-SSE-NEXT: movabsq $-4602115869219225600, %rax # imm = 0xC022000000000000 -; CHECK-SSE-NEXT: addq %rdi, %rax -; CHECK-SSE-NEXT: movq %rax, %xmm0 +; CHECK-SSE-NEXT: movzbl %dil, %eax +; CHECK-SSE-NEXT: incl %eax +; CHECK-SSE-NEXT: shlq $52, %rax +; CHECK-SSE-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000 +; CHECK-SSE-NEXT: addq %rax, %rcx +; CHECK-SSE-NEXT: movq %rcx, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: fmul_pow_shl_cnt2: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: incl %edi -; CHECK-AVX-NEXT: shlq $52, %rdi -; CHECK-AVX-NEXT: movabsq $-4602115869219225600, %rax # imm = 0xC022000000000000 -; CHECK-AVX-NEXT: addq %rdi, %rax -; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: movzbl %dil, %eax +; CHECK-AVX-NEXT: incl %eax +; CHECK-AVX-NEXT: shlq $52, %rax +; CHECK-AVX-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000 +; CHECK-AVX-NEXT: addq %rax, %rcx +; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 ; CHECK-AVX-NEXT: retq %shl = shl nuw i64 2, %cnt %conv = uitofp i64 %shl to double @@ -706,27 +714,55 @@ define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind { ret double %mul } +; Make sure we do a movzbl of the input register. 
+define double @fmul_pow_shl_cnt3(i8 %cnt) nounwind { +; CHECK-SSE-LABEL: fmul_pow_shl_cnt3: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: movzbl %dil, %eax +; CHECK-SSE-NEXT: shlq $52, %rax +; CHECK-SSE-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000 +; CHECK-SSE-NEXT: addq %rax, %rcx +; CHECK-SSE-NEXT: movq %rcx, %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX-LABEL: fmul_pow_shl_cnt3: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: movzbl %dil, %eax +; CHECK-AVX-NEXT: shlq $52, %rax +; CHECK-AVX-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000 +; CHECK-AVX-NEXT: addq %rax, %rcx +; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 +; CHECK-AVX-NEXT: retq + %zext_cnt = zext i8 %cnt to i64 + %shl = shl nuw i64 1, %zext_cnt + %conv = uitofp i64 %shl to double + %mul = fmul double -9.000000e+00, %conv + ret double %mul +} + +; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set +; in the original IR. define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind { ; CHECK-SSE-LABEL: fmul_pow_select: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-SSE-NEXT: leal 1(%rdi), %eax +; CHECK-SSE-NEXT: movzbl %dil, %eax +; CHECK-SSE-NEXT: leal 1(%rax), %ecx ; CHECK-SSE-NEXT: testb $1, %sil -; CHECK-SSE-NEXT: cmovnel %edi, %eax -; CHECK-SSE-NEXT: shll $23, %eax -; CHECK-SSE-NEXT: addl $1091567616, %eax # imm = 0x41100000 -; CHECK-SSE-NEXT: movd %eax, %xmm0 +; CHECK-SSE-NEXT: cmovnel %eax, %ecx +; CHECK-SSE-NEXT: shll $23, %ecx +; CHECK-SSE-NEXT: addl $1091567616, %ecx # imm = 0x41100000 +; CHECK-SSE-NEXT: movd %ecx, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: fmul_pow_select: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-AVX-NEXT: leal 1(%rdi), %eax +; CHECK-AVX-NEXT: movzbl %dil, %eax +; CHECK-AVX-NEXT: leal 1(%rax), %ecx ; CHECK-AVX-NEXT: testb $1, %sil -; CHECK-AVX-NEXT: cmovnel %edi, %eax -; CHECK-AVX-NEXT: shll $23, %eax -; CHECK-AVX-NEXT: addl $1091567616, %eax # imm = 0x41100000 -; CHECK-AVX-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-NEXT: cmovnel %eax, %ecx +; CHECK-AVX-NEXT: shll $23, %ecx +; CHECK-AVX-NEXT: addl $1091567616, %ecx # imm = 0x41100000 +; CHECK-AVX-NEXT: vmovd %ecx, %xmm0 ; CHECK-AVX-NEXT: retq %shl2 = shl nuw i32 2, %cnt %shl1 = shl nuw i32 1, %cnt @@ -736,27 +772,31 @@ define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind { ret float %mul } +; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set +; in the original IR. 
define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_fly_pow_mul_min_pow2: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: addl $3, %edi -; CHECK-SSE-NEXT: cmpl $13, %edi -; CHECK-SSE-NEXT: movl $13, %eax -; CHECK-SSE-NEXT: cmovbl %edi, %eax -; CHECK-SSE-NEXT: shll $23, %eax -; CHECK-SSE-NEXT: addl $1091567616, %eax # imm = 0x41100000 -; CHECK-SSE-NEXT: movd %eax, %xmm0 +; CHECK-SSE-NEXT: movzbl %dil, %eax +; CHECK-SSE-NEXT: addl $3, %eax +; CHECK-SSE-NEXT: cmpl $13, %eax +; CHECK-SSE-NEXT: movl $13, %ecx +; CHECK-SSE-NEXT: cmovbl %eax, %ecx +; CHECK-SSE-NEXT: shll $23, %ecx +; CHECK-SSE-NEXT: addl $1091567616, %ecx # imm = 0x41100000 +; CHECK-SSE-NEXT: movd %ecx, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: fmul_fly_pow_mul_min_pow2: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: addl $3, %edi -; CHECK-AVX-NEXT: cmpl $13, %edi -; CHECK-AVX-NEXT: movl $13, %eax -; CHECK-AVX-NEXT: cmovbl %edi, %eax -; CHECK-AVX-NEXT: shll $23, %eax -; CHECK-AVX-NEXT: addl $1091567616, %eax # imm = 0x41100000 -; CHECK-AVX-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-NEXT: movzbl %dil, %eax +; CHECK-AVX-NEXT: addl $3, %eax +; CHECK-AVX-NEXT: cmpl $13, %eax +; CHECK-AVX-NEXT: movl $13, %ecx +; CHECK-AVX-NEXT: cmovbl %eax, %ecx +; CHECK-AVX-NEXT: shll $23, %ecx +; CHECK-AVX-NEXT: addl $1091567616, %ecx # imm = 0x41100000 +; CHECK-AVX-NEXT: vmovd %ecx, %xmm0 ; CHECK-AVX-NEXT: retq %shl8 = shl nuw i64 8, %cnt %shl = call i64 @llvm.umin.i64(i64 %shl8, i64 8192) @@ -765,28 +805,30 @@ define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind { ret float %mul } +; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set +; in the original IR. define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_mul_max_pow2: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movl %edi, %eax +; CHECK-SSE-NEXT: movzbl %dil, %eax ; CHECK-SSE-NEXT: leaq 1(%rax), %rcx ; CHECK-SSE-NEXT: cmpq %rcx, %rax ; CHECK-SSE-NEXT: cmovaq %rax, %rcx ; CHECK-SSE-NEXT: shlq $52, %rcx ; CHECK-SSE-NEXT: movabsq $4613937818241073152, %rax # imm = 0x4008000000000000 -; CHECK-SSE-NEXT: addq %rcx, %rax +; CHECK-SSE-NEXT: orq %rcx, %rax ; CHECK-SSE-NEXT: movq %rax, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: fmul_pow_mul_max_pow2: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: movl %edi, %eax +; CHECK-AVX-NEXT: movzbl %dil, %eax ; CHECK-AVX-NEXT: leaq 1(%rax), %rcx ; CHECK-AVX-NEXT: cmpq %rcx, %rax ; CHECK-AVX-NEXT: cmovaq %rax, %rcx ; CHECK-AVX-NEXT: shlq $52, %rcx ; CHECK-AVX-NEXT: movabsq $4613937818241073152, %rax # imm = 0x4008000000000000 -; CHECK-AVX-NEXT: addq %rcx, %rax +; CHECK-AVX-NEXT: orq %rcx, %rax ; CHECK-AVX-NEXT: vmovq %rax, %xmm0 ; CHECK-AVX-NEXT: retq %shl2 = shl nuw i16 2, %cnt @@ -1161,23 +1203,25 @@ define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind { ret double %mul } +; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set +; in the original IR. 
define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_safe: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-SSE-NEXT: shlq $52, %rdi -; CHECK-SSE-NEXT: movabsq $8930638061065157010, %rax # imm = 0x7BEFFFFFFF5F3992 -; CHECK-SSE-NEXT: addq %rdi, %rax -; CHECK-SSE-NEXT: movq %rax, %xmm0 +; CHECK-SSE-NEXT: movzbl %dil, %eax +; CHECK-SSE-NEXT: shlq $52, %rax +; CHECK-SSE-NEXT: movabsq $8930638061065157010, %rcx # imm = 0x7BEFFFFFFF5F3992 +; CHECK-SSE-NEXT: addq %rax, %rcx +; CHECK-SSE-NEXT: movq %rcx, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: fmul_pow_shl_cnt_safe: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-AVX-NEXT: shlq $52, %rdi -; CHECK-AVX-NEXT: movabsq $8930638061065157010, %rax # imm = 0x7BEFFFFFFF5F3992 -; CHECK-AVX-NEXT: addq %rdi, %rax -; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: movzbl %dil, %eax +; CHECK-AVX-NEXT: shlq $52, %rax +; CHECK-AVX-NEXT: movabsq $8930638061065157010, %rcx # imm = 0x7BEFFFFFFF5F3992 +; CHECK-AVX-NEXT: addq %rax, %rcx +; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 ; CHECK-AVX-NEXT: retq %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to double @@ -1236,15 +1280,15 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind { ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-SSE-NEXT: shlq %cl, %rax ; CHECK-SSE-NEXT: testq %rax, %rax -; CHECK-SSE-NEXT: js .LBB22_1 +; CHECK-SSE-NEXT: js .LBB23_1 ; CHECK-SSE-NEXT: # %bb.2: ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 -; CHECK-SSE-NEXT: jmp .LBB22_3 -; CHECK-SSE-NEXT: .LBB22_1: +; CHECK-SSE-NEXT: jmp .LBB23_3 +; CHECK-SSE-NEXT: .LBB23_1: ; CHECK-SSE-NEXT: shrq %rax ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 ; CHECK-SSE-NEXT: addss %xmm1, %xmm1 -; CHECK-SSE-NEXT: .LBB22_3: +; CHECK-SSE-NEXT: .LBB23_3: ; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-SSE-NEXT: divss %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq @@ -1256,15 +1300,15 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind { ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-AVX2-NEXT: shlq %cl, %rax ; CHECK-AVX2-NEXT: testq %rax, %rax -; CHECK-AVX2-NEXT: js .LBB22_1 +; CHECK-AVX2-NEXT: js .LBB23_1 ; CHECK-AVX2-NEXT: # %bb.2: ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: jmp .LBB22_3 -; CHECK-AVX2-NEXT: .LBB22_1: +; CHECK-AVX2-NEXT: jmp .LBB23_3 +; CHECK-AVX2-NEXT: .LBB23_1: ; CHECK-AVX2-NEXT: shrq %rax ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: .LBB22_3: +; CHECK-AVX2-NEXT: .LBB23_3: ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1545,23 +1589,25 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind { ret half %mul } +; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set +; in the original IR. 
define double @fdiv_pow_shl_cnt32_to_dbl_okay(i32 %cnt) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-SSE-NEXT: shlq $52, %rdi -; CHECK-SSE-NEXT: movabsq $3936146074321813504, %rax # imm = 0x36A0000000000000 -; CHECK-SSE-NEXT: subq %rdi, %rax -; CHECK-SSE-NEXT: movq %rax, %xmm0 +; CHECK-SSE-NEXT: movzbl %dil, %eax +; CHECK-SSE-NEXT: shlq $52, %rax +; CHECK-SSE-NEXT: movabsq $3936146074321813504, %rcx # imm = 0x36A0000000000000 +; CHECK-SSE-NEXT: subq %rax, %rcx +; CHECK-SSE-NEXT: movq %rcx, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-AVX-NEXT: shlq $52, %rdi -; CHECK-AVX-NEXT: movabsq $3936146074321813504, %rax # imm = 0x36A0000000000000 -; CHECK-AVX-NEXT: subq %rdi, %rax -; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: movzbl %dil, %eax +; CHECK-AVX-NEXT: shlq $52, %rax +; CHECK-AVX-NEXT: movabsq $3936146074321813504, %rcx # imm = 0x36A0000000000000 +; CHECK-AVX-NEXT: subq %rax, %rcx +; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 ; CHECK-AVX-NEXT: retq %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to double @@ -1617,21 +1663,25 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind { ret float %mul } +; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set +; in the original IR. define float @fdiv_pow_shl_cnt32_okay(i32 %cnt) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt32_okay: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: shll $23, %edi -; CHECK-SSE-NEXT: movl $285212672, %eax # imm = 0x11000000 -; CHECK-SSE-NEXT: subl %edi, %eax -; CHECK-SSE-NEXT: movd %eax, %xmm0 +; CHECK-SSE-NEXT: movzbl %dil, %eax +; CHECK-SSE-NEXT: shll $23, %eax +; CHECK-SSE-NEXT: movl $285212672, %ecx # imm = 0x11000000 +; CHECK-SSE-NEXT: subl %eax, %ecx +; CHECK-SSE-NEXT: movd %ecx, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: fdiv_pow_shl_cnt32_okay: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: shll $23, %edi -; CHECK-AVX-NEXT: movl $285212672, %eax # imm = 0x11000000 -; CHECK-AVX-NEXT: subl %edi, %eax -; CHECK-AVX-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-NEXT: movzbl %dil, %eax +; CHECK-AVX-NEXT: shll $23, %eax +; CHECK-AVX-NEXT: movl $285212672, %ecx # imm = 0x11000000 +; CHECK-AVX-NEXT: subl %eax, %ecx +; CHECK-AVX-NEXT: vmovd %ecx, %xmm0 ; CHECK-AVX-NEXT: retq %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to float From 663db5c70dfef8961dfb0ef5408db48555de7afc Mon Sep 17 00:00:00 2001 From: Vikram Hegde <115221833+vikramRH@users.noreply.github.com> Date: Tue, 18 Feb 2025 11:13:31 +0530 Subject: [PATCH 075/127] [AMDGPU][NewPM] Port GCNNSAReassign pass to new pass manager (#125034) tests to be added while porting virtregrewrite and greedy regalloc --- llvm/lib/Target/AMDGPU/AMDGPU.h | 2 +- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 9 +- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 1 + llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp | 96 ++++++++++++------- llvm/lib/Target/AMDGPU/GCNNSAReassign.h | 22 +++++ 6 files changed, 93 insertions(+), 39 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/GCNNSAReassign.h diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 4a0e5ef58ac93..42392e22643b2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -448,7 +448,7 @@ ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(); 
void initializeAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(PassRegistry &); extern char &AMDGPUOpenCLEnqueuedBlockLoweringLegacyID; -void initializeGCNNSAReassignPass(PassRegistry &); +void initializeGCNNSAReassignLegacyPass(PassRegistry &); extern char &GCNNSAReassignID; void initializeGCNPreRALongBranchRegLegacyPass(PassRegistry &); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 225f84725874b..fd1341e8c91b2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -100,6 +100,7 @@ MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this)) MACHINE_FUNCTION_PASS("amdgpu-pre-ra-long-branch-reg", GCNPreRALongBranchRegPass()) MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass()) MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) +MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass()) MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass()) MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass()) MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass()) @@ -120,7 +121,6 @@ MACHINE_FUNCTION_PASS("si-wqm", SIWholeQuadModePass()) #define DUMMY_MACHINE_FUNCTION_PASS(NAME, CREATE_PASS) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-insert-delay-alu", AMDGPUInsertDelayAluPass()) -DUMMY_MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass()) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass()) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 92ab106dd4a98..7c9377e61230b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -32,6 +32,7 @@ #include "AMDGPUWaitSGPRHazards.h" #include "GCNDPPCombine.h" #include "GCNIterativeScheduler.h" +#include "GCNNSAReassign.h" #include "GCNPreRALongBranchReg.h" #include "GCNPreRAOptimizations.h" #include "GCNRewritePartialRegUses.h" @@ -550,7 +551,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUImageIntrinsicOptimizerPass(*PR); initializeAMDGPUPrintfRuntimeBindingPass(*PR); initializeAMDGPUResourceUsageAnalysisPass(*PR); - initializeGCNNSAReassignPass(*PR); + initializeGCNNSAReassignLegacyPass(*PR); initializeGCNPreRAOptimizationsLegacyPass(*PR); initializeGCNPreRALongBranchRegLegacyPass(*PR); initializeGCNRewritePartialRegUsesLegacyPass(*PR); @@ -2112,6 +2113,12 @@ Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const { return Error::success(); } +void AMDGPUCodeGenPassBuilder::addPreRewrite(AddMachinePass &addPass) const { + if (EnableRegReassign) { + addPass(GCNNSAReassignPass()); + } +} + void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization( AddMachinePass &addPass) const { Base::addMachineSSAOptimization(addPass); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 1455494d0ef7d..eb5a9ca1f86d6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -177,6 +177,7 @@ class AMDGPUCodeGenPassBuilder void addILPOpts(AddMachinePass &) const; void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const; Error 
addInstSelector(AddMachinePass &) const; + void addPreRewrite(AddMachinePass &) const; void addMachineSSAOptimization(AddMachinePass &) const; void addPostRegAlloc(AddMachinePass &) const; diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 85e79aa4b7595..13eb0ca539a4c 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -13,6 +13,7 @@ /// //===----------------------------------------------------------------------===// +#include "GCNNSAReassign.h" #include "AMDGPU.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" @@ -34,26 +35,12 @@ STATISTIC(NumNSAConverted, "Number of NSA instructions changed to sequential"); namespace { - -class GCNNSAReassign : public MachineFunctionPass { +class GCNNSAReassignImpl { public: - static char ID; - - GCNNSAReassign() : MachineFunctionPass(ID) { - initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; + GCNNSAReassignImpl(VirtRegMap *VM, LiveRegMatrix *LM, LiveIntervals *LS) + : VRM(VM), LRM(LM), LIS(LS) {} - StringRef getPassName() const override { return "GCN NSA Reassign"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } + bool run(MachineFunction &MF); private: using NSA_Status = enum { @@ -90,24 +77,43 @@ class GCNNSAReassign : public MachineFunctionPass { bool scavengeRegs(SmallVectorImpl &Intervals) const; }; +class GCNNSAReassignLegacy : public MachineFunctionPass { +public: + static char ID; + + GCNNSAReassignLegacy() : MachineFunctionPass(ID) { + initializeGCNNSAReassignLegacyPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "GCN NSA Reassign"; }; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + } // End anonymous namespace. 
-INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign", +INITIALIZE_PASS_BEGIN(GCNNSAReassignLegacy, DEBUG_TYPE, "GCN NSA Reassign", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy) -INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign", - false, false) - +INITIALIZE_PASS_END(GCNNSAReassignLegacy, DEBUG_TYPE, "GCN NSA Reassign", false, + false) -char GCNNSAReassign::ID = 0; +char GCNNSAReassignLegacy::ID = 0; -char &llvm::GCNNSAReassignID = GCNNSAReassign::ID; +char &llvm::GCNNSAReassignID = GCNNSAReassignLegacy::ID; -bool -GCNNSAReassign::tryAssignRegisters(SmallVectorImpl &Intervals, - unsigned StartReg) const { +bool GCNNSAReassignImpl::tryAssignRegisters( + SmallVectorImpl &Intervals, unsigned StartReg) const { unsigned NumRegs = Intervals.size(); for (unsigned N = 0; N < NumRegs; ++N) @@ -124,7 +130,7 @@ GCNNSAReassign::tryAssignRegisters(SmallVectorImpl &Intervals, return true; } -bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const { +bool GCNNSAReassignImpl::canAssign(unsigned StartReg, unsigned NumRegs) const { for (unsigned N = 0; N < NumRegs; ++N) { unsigned Reg = StartReg + N; if (!MRI->isAllocatable(Reg)) @@ -139,8 +145,8 @@ bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const { return true; } -bool -GCNNSAReassign::scavengeRegs(SmallVectorImpl &Intervals) const { +bool GCNNSAReassignImpl::scavengeRegs( + SmallVectorImpl &Intervals) const { unsigned NumRegs = Intervals.size(); if (NumRegs > MaxNumVGPRs) @@ -158,8 +164,8 @@ GCNNSAReassign::scavengeRegs(SmallVectorImpl &Intervals) const { return false; } -GCNNSAReassign::NSA_Status -GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { +GCNNSAReassignImpl::NSA_Status +GCNNSAReassignImpl::CheckNSA(const MachineInstr &MI, bool Fast) const { const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); if (!Info) return NSA_Status::NOT_NSA; @@ -235,16 +241,13 @@ GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { return NSA ? 
NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS; } -bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { +bool GCNNSAReassignImpl::run(MachineFunction &MF) { ST = &MF.getSubtarget(); if (!ST->hasNSAEncoding() || !ST->hasNonNSAEncoding()) return false; MRI = &MF.getRegInfo(); TRI = ST->getRegisterInfo(); - VRM = &getAnalysis().getVRM(); - LRM = &getAnalysis().getLRM(); - LIS = &getAnalysis().getLIS(); const SIMachineFunctionInfo *MFI = MF.getInfo(); MaxNumVGPRs = ST->getMaxNumVGPRs(MF); @@ -367,3 +370,24 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { return Changed; } + +bool GCNNSAReassignLegacy::runOnMachineFunction(MachineFunction &MF) { + auto *VRM = &getAnalysis().getVRM(); + auto *LRM = &getAnalysis().getLRM(); + auto *LIS = &getAnalysis().getLIS(); + + GCNNSAReassignImpl Impl(VRM, LRM, LIS); + return Impl.run(MF); +} + +PreservedAnalyses +GCNNSAReassignPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + auto &VRM = MFAM.getResult(MF); + auto &LRM = MFAM.getResult(MF); + auto &LIS = MFAM.getResult(MF); + + GCNNSAReassignImpl Impl(&VRM, &LRM, &LIS); + Impl.run(MF); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.h b/llvm/lib/Target/AMDGPU/GCNNSAReassign.h new file mode 100644 index 0000000000000..97a72e7ddbb24 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.h @@ -0,0 +1,22 @@ +//===- GCNNSAReassign.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_GCNNSAREASSIGN_H +#define LLVM_LIB_TARGET_AMDGPU_GCNNSAREASSIGN_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { +class GCNNSAReassignPass : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_GCNNSAREASSIGN_H From 0b719d3d63100c6af66b015f796ab74d3d218107 Mon Sep 17 00:00:00 2001 From: Nathan Ridge Date: Tue, 18 Feb 2025 00:59:45 -0500 Subject: [PATCH 076/127] [clangd] Enable parsing of forwarding functions in the preamble by default (#127359) Fixes https://github.com/clangd/clangd/issues/2324 --- clang-tools-extra/clangd/ClangdServer.h | 4 ++-- clang-tools-extra/clangd/Compiler.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h index e030bf04122d5..1e612e2ba618e 100644 --- a/clang-tools-extra/clangd/ClangdServer.h +++ b/clang-tools-extra/clangd/ClangdServer.h @@ -184,7 +184,7 @@ class ClangdServer { bool UseDirtyHeaders = false; // If true, parse emplace-like functions in the preamble. - bool PreambleParseForwardingFunctions = false; + bool PreambleParseForwardingFunctions = true; /// Whether include fixer insertions for Objective-C code should use #import /// instead of #include. @@ -501,7 +501,7 @@ class ClangdServer { // Whether the client supports folding only complete lines. 
bool LineFoldingOnly = false; - bool PreambleParseForwardingFunctions = false; + bool PreambleParseForwardingFunctions = true; bool ImportInsertions = false; diff --git a/clang-tools-extra/clangd/Compiler.h b/clang-tools-extra/clangd/Compiler.h index 4e68da7610ca2..e513e4c40794a 100644 --- a/clang-tools-extra/clangd/Compiler.h +++ b/clang-tools-extra/clangd/Compiler.h @@ -40,7 +40,7 @@ class IgnoreDiagnostics : public DiagnosticConsumer { // Options to run clang e.g. when parsing AST. struct ParseOptions { - bool PreambleParseForwardingFunctions = false; + bool PreambleParseForwardingFunctions = true; bool ImportInsertions = false; }; From d09cce166de9fc4fa243bdb4a2ea22df08110abd Mon Sep 17 00:00:00 2001 From: Nathan Ridge Date: Tue, 18 Feb 2025 01:13:35 -0500 Subject: [PATCH 077/127] [clang][Index] Use HeuristicResolver in libIndex (#125153) The uses replace hand-rolled code that did a subset of what HeuristicResolver does. --- clang/lib/Index/CMakeLists.txt | 1 + clang/lib/Index/IndexBody.cpp | 47 ++++++------------- clang/lib/Index/IndexingContext.cpp | 12 +++++ clang/lib/Index/IndexingContext.h | 10 ++-- .../Index/Core/index-dependent-source.cpp | 22 ++++----- 5 files changed, 45 insertions(+), 47 deletions(-) diff --git a/clang/lib/Index/CMakeLists.txt b/clang/lib/Index/CMakeLists.txt index b4e294304f115..f0d2b579c8df6 100644 --- a/clang/lib/Index/CMakeLists.txt +++ b/clang/lib/Index/CMakeLists.txt @@ -23,6 +23,7 @@ add_clang_library(clangIndex clangFormat clangFrontend clangLex + clangSema clangSerialization clangToolingCore diff --git a/clang/lib/Index/IndexBody.cpp b/clang/lib/Index/IndexBody.cpp index f1dc4d5831ce7..5e69987820730 100644 --- a/clang/lib/Index/IndexBody.cpp +++ b/clang/lib/Index/IndexBody.cpp @@ -13,6 +13,7 @@ #include "clang/AST/ExprConcepts.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Type.h" +#include "clang/Sema/HeuristicResolver.h" using namespace clang; using namespace clang::index; @@ -168,51 +169,31 @@ class BodyIndexer : public RecursiveASTVisitor { Parent, ParentDC, Roles, Relations, E); } - bool indexDependentReference( - const Expr *E, const Type *T, const DeclarationNameInfo &NameInfo, - llvm::function_ref Filter) { - if (!T) - return true; - const TemplateSpecializationType *TST = - T->getAs(); - if (!TST) - return true; - TemplateName TN = TST->getTemplateName(); - const ClassTemplateDecl *TD = - dyn_cast_or_null(TN.getAsTemplateDecl()); - if (!TD) - return true; - CXXRecordDecl *RD = TD->getTemplatedDecl(); - if (!RD->hasDefinition()) - return true; - RD = RD->getDefinition(); - std::vector Symbols = - RD->lookupDependentName(NameInfo.getName(), Filter); + bool indexDependentReference(const Expr *E, SourceLocation Loc, + std::vector TargetSymbols) { // FIXME: Improve overload handling. 
- if (Symbols.size() != 1) + if (TargetSymbols.size() != 1) return true; - SourceLocation Loc = NameInfo.getLoc(); if (Loc.isInvalid()) Loc = E->getBeginLoc(); SmallVector Relations; SymbolRoleSet Roles = getRolesForRef(E, Relations); - return IndexCtx.handleReference(Symbols[0], Loc, Parent, ParentDC, Roles, - Relations, E); + return IndexCtx.handleReference(TargetSymbols[0], Loc, Parent, ParentDC, + Roles, Relations, E); } bool VisitCXXDependentScopeMemberExpr(CXXDependentScopeMemberExpr *E) { - const DeclarationNameInfo &Info = E->getMemberNameInfo(); - return indexDependentReference( - E, E->getBaseType().getTypePtrOrNull(), Info, - [](const NamedDecl *D) { return D->isCXXInstanceMember(); }); + auto *Resolver = IndexCtx.getResolver(); + assert(Resolver); + return indexDependentReference(E, E->getMemberNameInfo().getLoc(), + Resolver->resolveMemberExpr(E)); } bool VisitDependentScopeDeclRefExpr(DependentScopeDeclRefExpr *E) { - const DeclarationNameInfo &Info = E->getNameInfo(); - const NestedNameSpecifier *NNS = E->getQualifier(); - return indexDependentReference( - E, NNS->getAsType(), Info, - [](const NamedDecl *D) { return !D->isCXXInstanceMember(); }); + auto *Resolver = IndexCtx.getResolver(); + assert(Resolver); + return indexDependentReference(E, E->getNameInfo().getLoc(), + Resolver->resolveDeclRefExpr(E)); } bool VisitDesignatedInitExpr(DesignatedInitExpr *E) { diff --git a/clang/lib/Index/IndexingContext.cpp b/clang/lib/Index/IndexingContext.cpp index 2dd68dfcc5a70..bdd6c5acf1d34 100644 --- a/clang/lib/Index/IndexingContext.cpp +++ b/clang/lib/Index/IndexingContext.cpp @@ -14,6 +14,7 @@ #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Index/IndexDataConsumer.h" +#include "clang/Sema/HeuristicResolver.h" using namespace clang; using namespace index; @@ -25,6 +26,17 @@ static bool isGeneratedDecl(const Decl *D) { return false; } +IndexingContext::IndexingContext(IndexingOptions IndexOpts, + IndexDataConsumer &DataConsumer) + : IndexOpts(IndexOpts), DataConsumer(DataConsumer) {} + +IndexingContext::~IndexingContext() = default; + +void IndexingContext::setASTContext(ASTContext &ctx) { + Ctx = &ctx; + Resolver = Ctx ? 
std::make_unique(*Ctx) : nullptr; +} + bool IndexingContext::shouldIndex(const Decl *D) { return !isGeneratedDecl(D); } diff --git a/clang/lib/Index/IndexingContext.h b/clang/lib/Index/IndexingContext.h index 3020b33bea385..01bfcb9d578bc 100644 --- a/clang/lib/Index/IndexingContext.h +++ b/clang/lib/Index/IndexingContext.h @@ -21,6 +21,7 @@ namespace clang { class Decl; class DeclGroupRef; class ImportDecl; + class HeuristicResolver; class TagDecl; class TypeSourceInfo; class NamedDecl; @@ -39,15 +40,18 @@ class IndexingContext { IndexingOptions IndexOpts; IndexDataConsumer &DataConsumer; ASTContext *Ctx = nullptr; + std::unique_ptr Resolver; public: - IndexingContext(IndexingOptions IndexOpts, IndexDataConsumer &DataConsumer) - : IndexOpts(IndexOpts), DataConsumer(DataConsumer) {} + IndexingContext(IndexingOptions IndexOpts, IndexDataConsumer &DataConsumer); + ~IndexingContext(); const IndexingOptions &getIndexOpts() const { return IndexOpts; } IndexDataConsumer &getDataConsumer() { return DataConsumer; } - void setASTContext(ASTContext &ctx) { Ctx = &ctx; } + void setASTContext(ASTContext &ctx); + + HeuristicResolver *getResolver() const { return Resolver.get(); } bool shouldIndex(const Decl *D); diff --git a/clang/test/Index/Core/index-dependent-source.cpp b/clang/test/Index/Core/index-dependent-source.cpp index 8fec9abd1e926..ef414c8fdf7a0 100644 --- a/clang/test/Index/Core/index-dependent-source.cpp +++ b/clang/test/Index/Core/index-dependent-source.cpp @@ -3,7 +3,7 @@ int invalid; class Base { - void baseFunction(); + void baseFunction() const; int baseField; @@ -13,7 +13,7 @@ class Base { template class BaseTemplate { public: - T baseTemplateFunction(); + T baseTemplateFunction() const; T baseTemplateField; @@ -25,7 +25,7 @@ class TemplateClass: public Base , public BaseTemplate { public: ~TemplateClass(); - T function() { } + T function() const { } static void staticFunction() { } @@ -48,27 +48,27 @@ template void indexSimpleDependentDeclarations(const TemplateClass &object) { // Valid instance members: object.function(); -// CHECK: [[@LINE-1]]:10 | instance-method/C++ | function | c:@ST>2#T#T@TemplateClass@F@function# | | Ref,Call,RelCall,RelCont | rel: 1 +// CHECK: [[@LINE-1]]:10 | instance-method/C++ | function | c:@ST>2#T#T@TemplateClass@F@function#1 | | Ref,Call,RelCall,RelCont | rel: 1 object.field; // CHECK: [[@LINE-1]]:10 | field/C++ | field | c:@ST>2#T#T@TemplateClass@FI@field | | Ref,RelCont | rel: 1 object.baseFunction(); -// CHECK: [[@LINE-1]]:10 | instance-method/C++ | baseFunction | c:@S@Base@F@baseFunction# | __ZN4Base12baseFunctionEv | Ref,Call,RelCall,RelCont | rel: 1 +// CHECK: [[@LINE-1]]:10 | instance-method/C++ | baseFunction | c:@S@Base@F@baseFunction#1 | __ZNK4Base12baseFunctionEv | Ref,Call,RelCall,RelCont | rel: 1 object.baseField; // CHECK: [[@LINE-1]]:10 | field/C++ | baseField | c:@S@Base@FI@baseField | | Ref,RelCont | rel: 1 object.baseTemplateFunction(); -// CHECK: [[@LINE-1]]:10 | instance-method/C++ | baseTemplateFunction | c:@ST>1#T@BaseTemplate@F@baseTemplateFunction# | | Ref,Call,RelCall,RelCont | rel: 1 +// CHECK: [[@LINE-1]]:10 | instance-method/C++ | baseTemplateFunction | c:@ST>1#T@BaseTemplate@F@baseTemplateFunction#1 | | Ref,Call,RelCall,RelCont | rel: 1 object.baseTemplateField; // CHECK: [[@LINE-1]]:10 | field/C++ | baseTemplateField | c:@ST>1#T@BaseTemplate@FI@baseTemplateField | | Ref,RelCont | rel: 1 - // Invalid instance members: + // Static members (these are still valid to access via an instance): object.variable; -// CHECK-NOT: 
[[@LINE-1]]:10 +// CHECK: [[@LINE-1]]:10 | static-property/C++ | variable | c:@ST>2#T#T@TemplateClass@variable | __ZN13TemplateClass8variableE | Ref,RelCont | rel: 1 object.staticFunction(); -// CHECK-NOT: [[@LINE-1]]:10 +// CHECK: [[@LINE-1]]:10 | static-method/C++ | staticFunction | c:@ST>2#T#T@TemplateClass@F@staticFunction#S | | Ref,Call,RelCall,RelCont | rel: 1 object.Struct; -// CHECK-NOT: [[@LINE-1]]:10 +// CHECK: [[@LINE-1]]:10 | struct/C | Struct | c:@ST>2#T#T@TemplateClass@S@Struct | | Ref,RelCont | rel: 1 object.EnumValue; -// CHECK-NOT: [[@LINE-1]]:10 +// CHECK: [[@LINE-1]]:10 | enumerator/C | EnumValue | c:@ST>2#T#T@TemplateClass@E@Enum@EnumValue | | Ref,RelCont | rel: 1 // Valid static members: TemplateClass::staticFunction(); From a5e6ccf546932118cbbab6633f5d599914fd75ec Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Tue, 18 Feb 2025 07:13:59 +0100 Subject: [PATCH 078/127] [mlir][bazel] Port https://github.com/llvm/llvm-project/commit/517800e37e8d3a4ee84214bef65e227612c2a98b (#127544) Introduces a `LinalgInterfaces` target so that `TensorDialect` doesn't need to depend on `LinalgDialect`, which would introduce a circular dependency. --- .../Dialect/Linalg/IR/LinalgInterfaces.cpp | 2 - mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 2 +- .../llvm-project-overlay/mlir/BUILD.bazel | 87 ++++++++++++++----- .../mlir/test/BUILD.bazel | 1 + 4 files changed, 67 insertions(+), 25 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp index 466a9799295f9..c16c38ea22a5d 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp @@ -13,8 +13,6 @@ #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineExprVisitor.h" #include "mlir/IR/AffineMap.h" diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index e741144647043..72e9d4d9c64d9 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -10,7 +10,7 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Complex/IR/Complex.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Tensor/Utils/Utils.h" #include "mlir/Dialect/Utils/IndexingUtils.h" diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 92aedac837197..60f66b154e313 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -7763,7 +7763,6 @@ td_library( name = "TensorOpsTdFiles", srcs = [ "include/mlir/Dialect/Tensor/IR/TensorBase.td", - "include/mlir/Dialect/Tensor/IR/TensorInterfaces.td", "include/mlir/Dialect/Tensor/IR/TensorOps.td", ], includes = ["include"], @@ -7813,23 +7812,6 @@ gentbl_cc_library( deps = [":TensorOpsTdFiles"], ) -gentbl_cc_library( - name = "TensorInterfacesIncGen", - tbl_outs = [ - ( - ["--gen-op-interface-decls"], - "include/mlir/Dialect/Tensor/IR/TensorInterfaces.h.inc", - ), - ( - ["--gen-op-interface-defs"], - "include/mlir/Dialect/Tensor/IR/TensorInterfaces.cpp.inc", - ), - ], - tblgen = ":mlir-tblgen", - td_file = 
"include/mlir/Dialect/Tensor/IR/TensorInterfaces.td", - deps = [":TensorOpsTdFiles"], -) - cc_library( name = "TensorDialect", srcs = [ @@ -7859,13 +7841,13 @@ cc_library( ":InferIntRangeInterface", ":InferTypeOpInterface", ":InliningUtils", + ":LinalgInterfaces", ":LoopLikeInterface", ":ParallelCombiningOpInterface", ":ShapedOpInterfaces", ":SideEffectInterfaces", ":SubsetOpInterface", ":Support", - ":TensorInterfacesIncGen", ":TensorOpsIncGen", ":TilingInterface", ":TransformDialectInterfaces", @@ -11206,6 +11188,23 @@ gentbl_cc_library( deps = [":LinalgOpsTdFiles"], ) +gentbl_cc_library( + name = "LinalgRelayoutOpsIncGen", + tbl_outs = [ + ( + ["-gen-op-decls"], + "include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.h.inc", + ), + ( + ["-gen-op-defs"], + "include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.cpp.inc", + ), + ], + tblgen = ":mlir-tblgen", + td_file = "include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td", + deps = [":LinalgOpsTdFiles"], +) + gentbl_cc_library( name = "LinalgEnumsIncGen", tbl_outs = [ @@ -11532,10 +11531,50 @@ cc_library( ], ) +cc_library( + name = "LinalgInterfaces", + srcs = [ + "include/mlir/Dialect/Linalg/IR/Linalg.h", + "lib/Dialect/Linalg/IR/LinalgInterfaces.cpp", + ], + hdrs = ["include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h"], + includes = ["include"], + deps = [ + ":AffineDialect", + ":ArithDialect", + ":ArithUtils", + ":BytecodeOpInterface", + ":ComplexDialect", + ":ControlFlowInterfaces", + ":CopyOpInterface", + ":DestinationStyleOpInterface", + ":DialectUtils", + ":IR", + ":InferTypeOpInterface", + ":LinalgEnumsIncGen", + ":LinalgInterfacesIncGen", + ":LinalgOpsIncGen", + ":LinalgRelayoutOpsIncGen", + ":LinalgStructuredOpsIncGen", + ":SideEffectInterfaces", + ":Support", + ":TilingInterface", + ":ViewLikeInterface", + "//third_party/llvm/llvm-project/llvm:Support", + ], +) + cc_library( name = "LinalgDialect", - srcs = glob(["lib/Dialect/Linalg/IR/*.cpp"]), - hdrs = glob(["include/mlir/Dialect/Linalg/IR/*.h"]), + srcs = [ + "lib/Dialect/Linalg/IR/LinalgDialect.cpp", + "lib/Dialect/Linalg/IR/LinalgOps.cpp", + "lib/Dialect/Linalg/IR/ValueBoundsOpInterfaceImpl.cpp", + ], + hdrs = [ + "include/mlir/Dialect/Linalg/IR/Linalg.h", + "include/mlir/Dialect/Linalg/IR/ValueBoundsOpInterfaceImpl.h", + ], includes = ["include"], deps = [ ":AffineDialect", @@ -11554,9 +11593,10 @@ cc_library( ":InferTypeOpInterface", ":InliningUtils", ":LinalgEnumsIncGen", - ":LinalgInterfacesIncGen", + ":LinalgInterfaces", ":LinalgNamedStructuredOpsYamlIncGen", ":LinalgOpsIncGen", + ":LinalgRelayoutOpsIncGen", ":LinalgStructuredOpsIncGen", ":MathDialect", ":MemRefDialect", @@ -11568,6 +11608,7 @@ cc_library( ":SubsetOpInterface", ":Support", ":TensorDialect", + ":TensorUtils", ":TilingInterface", ":ValueBoundsOpInterface", ":ViewLikeInterface", @@ -11599,6 +11640,7 @@ cc_library( ":IR", ":IndexDialect", ":LinalgDialect", + ":LinalgInterfaces", ":LinalgMatchOpsIncGen", ":LinalgTransformEnumsIncGen", ":LinalgTransformOpsIncGen", @@ -11710,6 +11752,7 @@ cc_library( ":IR", ":IndexDialect", ":LinalgDialect", + ":LinalgInterfaces", ":LinalgPassIncGen", ":LinalgStructuredOpsIncGen", ":LinalgUtils", diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel index 3e6114abfc078..9b005b206a101 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -425,6 +425,7 @@ cc_library( "//mlir:LLVMDialect", "//mlir:LLVMIRToLLVMTranslation", 
"//mlir:LinalgDialect", + "//mlir:LinalgInterfaces", "//mlir:LoopLikeInterface", "//mlir:MemorySlotInterfaces", "//mlir:Pass", From 6ba34f9e7374109e2d2119b5de3c196aa928f179 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 17 Feb 2025 22:14:22 -0800 Subject: [PATCH 079/127] [RISCV] Use policy instead of ForceTailAgnostic for vmsbf/vmsif/vmsof pseudos. (#127535) ForceTailAgnostic is currently treated as an override of the policy operand. It doesn't do anything else so we can just use the policy directly. --- llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 77f41e3c202c7..33c04d1c05613 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -122,6 +122,7 @@ def DecImm : SDNodeXForm, SchedUnary<"WriteVMSFSV", "ReadVMSFSV", mx, forcePassthruRead=true>; - let ForceTailAgnostic = true in def "_M_" # mti.BX # "_MASK" : VPseudoUnaryMask, SchedUnary<"WriteVMSFSV", "ReadVMSFSV", mx, @@ -4019,7 +4019,7 @@ class VPatMaskUnaryMask(inst#"_M_"#mti.BX#"_MASK") (mti.Mask VR:$passthru), (mti.Mask VR:$rs2), - (mti.Mask VMV0:$vm), GPR:$vl, mti.Log2SEW, TU_MU)>; + (mti.Mask VMV0:$vm), GPR:$vl, mti.Log2SEW, TA_MU)>; class VPatUnaryAnyMask Date: Tue, 18 Feb 2025 14:24:20 +0800 Subject: [PATCH 080/127] [RISCV] Implement isHighLatencyDef() (#127476) And returns true for div/rem/sqrt/... operations. This is an alternative if we don't support generic scheduling model. --- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 50 +++++++++++ llvm/lib/Target/RISCV/RISCVInstrInfo.h | 2 + .../CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll | 22 ++--- .../CodeGen/RISCV/rvv/sink-splat-operands.ll | 12 +-- .../RISCV/rvv/vfdiv-constrained-sdnode.ll | 88 ++++++++++++------- llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll | 76 +++++----------- .../RISCV/rvv/vfsqrt-constrained-sdnode.ll | 16 ++-- llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll | 16 ++-- 8 files changed, 161 insertions(+), 121 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 1ec299e3c8cc0..456fb66917216 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -4336,3 +4336,53 @@ RISCVInstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const { return std::make_unique(LHS, RHS, Cond); } + +// FIXME: We should remove this if we have a default generic scheduling model. +bool RISCVInstrInfo::isHighLatencyDef(int Opc) const { + unsigned RVVMCOpcode = RISCV::getRVVMCOpcode(Opc); + Opc = RVVMCOpcode ? RVVMCOpcode : Opc; + switch (Opc) { + default: + return false; + // Integer div/rem. + case RISCV::DIV: + case RISCV::DIVW: + case RISCV::DIVU: + case RISCV::DIVUW: + case RISCV::REM: + case RISCV::REMW: + case RISCV::REMU: + case RISCV::REMUW: + // Floating-point div/sqrt. 
+ case RISCV::FDIV_H: + case RISCV::FDIV_S: + case RISCV::FDIV_D: + case RISCV::FDIV_H_INX: + case RISCV::FDIV_S_INX: + case RISCV::FDIV_D_INX: + case RISCV::FDIV_D_IN32X: + case RISCV::FSQRT_H: + case RISCV::FSQRT_S: + case RISCV::FSQRT_D: + case RISCV::FSQRT_H_INX: + case RISCV::FSQRT_S_INX: + case RISCV::FSQRT_D_INX: + case RISCV::FSQRT_D_IN32X: + // Vector integer div/rem + case RISCV::VDIV_VV: + case RISCV::VDIV_VX: + case RISCV::VDIVU_VV: + case RISCV::VDIVU_VX: + case RISCV::VREM_VV: + case RISCV::VREM_VX: + case RISCV::VREMU_VV: + case RISCV::VREMU_VX: + // Vector floating-point div/sqrt. + case RISCV::VFDIV_VV: + case RISCV::VFDIV_VF: + case RISCV::VFRDIV_VF: + case RISCV::VFSQRT_V: + case RISCV::VFRSQRT7_V: + return true; + } +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index ec628620d2982..afbc8df50b452 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -300,6 +300,8 @@ class RISCVInstrInfo : public RISCVGenInstrInfo { std::unique_ptr analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override; + bool isHighLatencyDef(int Opc) const override; + protected: const RISCVSubtarget &STI; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll index eb7be14abe431..0d1d75c1b2a75 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll @@ -894,18 +894,18 @@ define <2 x i16> @vwmul_v2i16_multiuse(ptr %x, ptr %y, ptr %z, ptr %w) { ; CHECK-LABEL: vwmul_v2i16_multiuse: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vle8.v v10, (a2) -; CHECK-NEXT: vle8.v v11, (a3) -; CHECK-NEXT: vsext.vf2 v12, v8 +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vle8.v v9, (a2) +; CHECK-NEXT: vsext.vf2 v10, v8 ; CHECK-NEXT: vsext.vf2 v8, v9 -; CHECK-NEXT: vsext.vf2 v9, v10 -; CHECK-NEXT: vsext.vf2 v10, v11 -; CHECK-NEXT: vmul.vv v11, v12, v10 -; CHECK-NEXT: vmul.vv v10, v8, v10 -; CHECK-NEXT: vdivu.vv v8, v8, v9 -; CHECK-NEXT: vor.vv v9, v11, v10 +; CHECK-NEXT: vdivu.vv v8, v10, v8 +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v11, (a3) +; CHECK-NEXT: vsext.vf2 v12, v9 +; CHECK-NEXT: vsext.vf2 v9, v11 +; CHECK-NEXT: vmul.vv v11, v12, v9 +; CHECK-NEXT: vmul.vv v9, v10, v9 +; CHECK-NEXT: vor.vv v9, v11, v9 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll index 1948675ae9cf0..c46334fe556eb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -1564,8 +1564,8 @@ define void @sink_splat_fdiv_scalable(ptr nocapture %a, float %x) { ; CHECK-NEXT: .LBB27_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vfdiv.vf v8, v8, fa0 +; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vs1r.v v8, (a5) ; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB27_3 @@ -1654,8 +1654,8 @@ define void @sink_splat_frdiv_scalable(ptr nocapture %a, float %x) { ; CHECK-NEXT: .LBB28_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vfrdiv.vf v8, v8, fa0 +; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vs1r.v v8, (a5) ; CHECK-NEXT: 
add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB28_3 @@ -2504,8 +2504,8 @@ define void @sink_splat_udiv_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB42_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vdivu.vx v8, v8, a1 +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vs2r.v v8, (a6) ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB42_3 @@ -2595,8 +2595,8 @@ define void @sink_splat_sdiv_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB43_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vdiv.vx v8, v8, a1 +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vs2r.v v8, (a6) ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB43_3 @@ -2686,8 +2686,8 @@ define void @sink_splat_urem_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB44_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vremu.vx v8, v8, a1 +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vs2r.v v8, (a6) ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB44_3 @@ -2777,8 +2777,8 @@ define void @sink_splat_srem_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB45_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vrem.vx v8, v8, a1 +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vs2r.v v8, (a6) ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB45_3 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll index 07750623dd44b..217a02d08dead 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll @@ -221,16 +221,16 @@ define @vfdiv_vv_nxv32bf16( %va, @vfdiv_vf_nxv32bf16( %va, bf ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vmv.v.x v16, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v0, v8, v0 +; CHECK-NEXT: vfdiv.vv v24, v16, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, 
m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v16, v24, v16 +; CHECK-NEXT: vfdiv.vv v16, v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -573,16 +583,16 @@ define @vfdiv_vv_nxv32f16( %va, @vfdiv_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: sub sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v8, a0 +; ZVFHMIN-NEXT: vmv.v.x v16, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v0, v8, v0 +; ZVFHMIN-NEXT: vfdiv.vv v24, v16, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v12 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v16, v24, v16 +; ZVFHMIN-NEXT: vfdiv.vv v16, v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll index e671ba850415b..9aba6455f0fac 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll @@ -200,16 +200,16 @@ define @vfdiv_vv_nxv32bf16( %va, @vfdiv_vv_nxv32bf16( %va, @vfdiv_vf_nxv32bf16( %va, bfloat %b) { ; CHECK-LABEL: vfdiv_vf_nxv32bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: 
.cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v0, v8, v0 +; CHECK-NEXT: vfdiv.vv v16, v16, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v16, v24, v16 +; CHECK-NEXT: vfdiv.vv v24, v24, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -528,16 +512,16 @@ define @vfdiv_vv_nxv32f16( %va, @vfdiv_vf_nxv32f16( %va, half %b ; ; ZVFHMIN-LABEL: vfdiv_vf_nxv32f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: sub sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v0, v8, v0 +; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v16, v24, v16 +; ZVFHMIN-NEXT: vfdiv.vv v24, v24, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git 
a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll index d5e65e2c8fd3f..eeb5f3bc984d3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll @@ -92,15 +92,15 @@ define @vfsqrt_nxv32bf16( %v) stric ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsqrt.v v16, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfsqrt.v v16, v24 +; CHECK-NEXT: vfsqrt.v v24, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.sqrt.nxv32bf16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -229,15 +229,15 @@ define @vfsqrt_nxv32f16( %v) strictfp { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v16, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsqrt.v v16, v24 +; ZVFHMIN-NEXT: vfsqrt.v v24, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: ret %r = call @llvm.experimental.constrained.sqrt.nxv32f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll index 4d761981aac97..6d7662db2b157 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll @@ -87,15 +87,15 @@ define @vfsqrt_nxv32bf16( %v) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsqrt.v v16, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfsqrt.v v16, v24 +; CHECK-NEXT: vfsqrt.v v24, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: ret %r = call @llvm.sqrt.nxv32bf16( %v) ret %r @@ -224,15 +224,15 @@ define @vfsqrt_nxv32f16( %v) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v16, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsqrt.v v16, v24 +; ZVFHMIN-NEXT: vfsqrt.v v24, v24 ; ZVFHMIN-NEXT: 
vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: ret %r = call @llvm.sqrt.nxv32f16( %v) ret %r From 6e532700f86f56b51506cc5a733f4f21fd03ab03 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Tue, 18 Feb 2025 07:26:10 +0100 Subject: [PATCH 081/127] [clang][bazel] Port https://github.com/llvm/llvm-project/commit/d09cce166de9fc4fa243bdb4a2ea22df08110abd --- utils/bazel/llvm-project-overlay/clang/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index 7fd7c8b438629..2aced96c112ef 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -1048,6 +1048,7 @@ cc_library( ":frontend", ":lex", ":rewrite", + ":sema", ":serialization", "//llvm:Core", "//llvm:Support", From ab8d99530d4e80b619c19681781eac5f545b2c38 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Tue, 18 Feb 2025 07:38:23 +0100 Subject: [PATCH 082/127] [mlir][bazel] Fix after https://github.com/llvm/llvm-project/pull/127544 --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 60f66b154e313..53aca8ab042ad 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -11560,7 +11560,7 @@ cc_library( ":Support", ":TilingInterface", ":ViewLikeInterface", - "//third_party/llvm/llvm-project/llvm:Support", + "//llvm:Support", ], ) From 77410f2a25529c9675853cf30c76168cccfe0f5d Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Tue, 18 Feb 2025 07:50:23 +0100 Subject: [PATCH 083/127] [mlir][tensor] Remove unnecessary include. This include introduced an unwanted dependency from tensor to tensor utils. --- mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index 72e9d4d9c64d9..fad7db48b9872 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -12,7 +12,6 @@ #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Dialect/Tensor/Utils/Utils.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Utils/ReshapeOpsUtils.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" From 251377c47d296000158347d6a1cba2e6b30132a3 Mon Sep 17 00:00:00 2001 From: Matthew Devereau Date: Tue, 18 Feb 2025 07:06:56 +0000 Subject: [PATCH 084/127] [InstCombine] Fold shift+cttz with power of 2 operands (#127055) #121386 Introduced cttz intrinsics which caused a regression where vscale/vscale divisions could no longer be constant folded. This fold was suggested as a fix in #126411. 
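The arithmetic behind the proposed fold can be sanity-checked outside of LLVM. The standalone C++ snippet below is illustrative only and is not part of this patch: for a power of two p = 2^k, cttz(p << y) == k + y, so (p << x) >> cttz(p << y) == (1 << x) >> y whenever none of the shifts wrap, which is why the transform requires nuw/nsw on both shifts. The Alive2 link that follows is the formal check of the same transform.

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  // Check (p << x) >> cttz(p << y) == (1 << x) >> y for power-of-two p.
  // The values are chosen so that no shift wraps (the nuw precondition).
  for (uint32_t p : {1u, 2u, 8u, 1u << 20}) {
    for (uint32_t x = 0; x < 8; ++x) {
      for (uint32_t y = 0; y < 8; ++y) {
        uint32_t lhs = (p << x) >> std::countr_zero(p << y);
        uint32_t rhs = (1u << x) >> y;
        assert(lhs == rhs);
      }
    }
  }
  return 0;
}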
https://alive2.llvm.org/ce/z/gWbtPw --- .../InstCombine/InstCombineShifts.cpp | 16 ++++++++++ .../Transforms/InstCombine/shift-cttz-ctlz.ll | 30 +++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index 7ef95800975db..90cd279e8a457 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -1613,6 +1613,22 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) { if (Instruction *Overflow = foldLShrOverflowBit(I)) return Overflow; + // Transform ((pow2 << x) >> cttz(pow2 << y)) -> ((1 << x) >> y) + Value *Shl0_Op0, *Shl0_Op1, *Shl1_Op1; + BinaryOperator *Shl1; + if (match(Op0, m_Shl(m_Value(Shl0_Op0), m_Value(Shl0_Op1))) && + match(Op1, m_Intrinsic(m_BinOp(Shl1))) && + match(Shl1, m_Shl(m_Specific(Shl0_Op0), m_Value(Shl1_Op1))) && + isKnownToBeAPowerOfTwo(Shl0_Op0, /*OrZero=*/true, 0, &I)) { + auto *Shl0 = cast(Op0); + bool HasNUW = Shl0->hasNoUnsignedWrap() && Shl1->hasNoUnsignedWrap(); + bool HasNSW = Shl0->hasNoSignedWrap() && Shl1->hasNoSignedWrap(); + if (HasNUW || HasNSW) { + Value *NewShl = Builder.CreateShl(ConstantInt::get(Shl1->getType(), 1), + Shl0_Op1, "", HasNUW, HasNSW); + return BinaryOperator::CreateLShr(NewShl, Shl1_Op1); + } + } return nullptr; } diff --git a/llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll b/llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll index 63caec9501325..e82e33e9d7f04 100644 --- a/llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll +++ b/llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll @@ -103,4 +103,34 @@ entry: ret i32 %res } +define i64 @fold_cttz_64() vscale_range(1,16) { +; CHECK-LABEL: define i64 @fold_cttz_64( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i64 4 +; +entry: + %vscale = tail call i64 @llvm.vscale.i64() + %shl0 = shl nuw nsw i64 %vscale, 4 + %shl1 = shl nuw nsw i64 %vscale, 2 + %cttz = tail call range(i64 2, 65) i64 @llvm.cttz.i64(i64 %shl1, i1 true) + %div1 = lshr i64 %shl0, %cttz + ret i64 %div1 +} + +define i32 @fold_cttz_32() vscale_range(1,16) { +; CHECK-LABEL: define i32 @fold_cttz_32( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i32 4 +; +entry: + %vscale = tail call i32 @llvm.vscale.i32() + %shl0 = shl nuw nsw i32 %vscale, 4 + %shl1 = shl nuw nsw i32 %vscale, 2 + %cttz = tail call range(i32 2, 65) i32 @llvm.cttz.i32(i32 %shl1, i1 true) + %div1 = lshr i32 %shl0, %cttz + ret i32 %div1 +} + declare void @use(i32) From b227c2578cc77e2113846b270d0b0a08f53e8019 Mon Sep 17 00:00:00 2001 From: lonely eagle <2020382038@qq.com> Date: Tue, 18 Feb 2025 15:49:58 +0800 Subject: [PATCH 085/127] [mlir][scf] add unroll-full option to test-loop-unrolling pass (#127158) Some loops cannot be unrolled by affine-loop-unroll pass. After running lower-affine pass, they can be unrolled in scf.To enable conversion of vector Ops in scf to llvm dialect, unroll-full option was added. 
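As a minimal sketch of how the new loopUnrollFull() utility added below might be driven from C++ (the helper shown here is illustrative and not part of the patch), a downstream pass could collect every scf.for in a function and fully unroll the ones whose trip count is a compile-time constant; loops with an unknown trip count are left untouched because loopUnrollFull() returns failure() without modifying them.

#include "llvm/ADT/SmallVector.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Utils/Utils.h"

// Collect the loops first so the walk does not iterate over IR that
// loopUnrollFull() is about to rewrite.
static void unrollAllConstantTripCountLoops(mlir::func::FuncOp func) {
  llvm::SmallVector<mlir::scf::ForOp> loops;
  func.walk([&](mlir::scf::ForOp forOp) { loops.push_back(forOp); });
  for (mlir::scf::ForOp forOp : loops)
    (void)mlir::loopUnrollFull(forOp); // failure() if the trip count is not a known constant
}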
--------- Co-authored-by: Oleksandr "Alex" Zinenko --- mlir/include/mlir/Dialect/SCF/Utils/Utils.h | 3 + mlir/lib/Dialect/SCF/Utils/Utils.cpp | 14 +++++ mlir/test/Transforms/scf-loop-unroll.mlir | 57 +++++++++++++++++++ .../lib/Dialect/SCF/TestLoopUnrolling.cpp | 14 ++++- 4 files changed, 85 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h index 02ffa0da7a8b8..c0c11c9e38994 100644 --- a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h @@ -126,6 +126,9 @@ FailureOr loopUnrollByFactor( scf::ForOp forOp, uint64_t unrollFactor, function_ref annotateFn = nullptr); +/// Unrolls this loop completely. +LogicalResult loopUnrollFull(scf::ForOp forOp); + /// Unrolls and jams this `scf.for` operation by the specified unroll factor. /// Returns failure if the loop cannot be unrolled either due to restrictions or /// due to invalid unroll factors. In case of unroll factor of 1, the function diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp index fa82bcb816a2a..bc1cb24303ad2 100644 --- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp +++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp @@ -498,6 +498,20 @@ FailureOr mlir::loopUnrollByFactor( return resultLoops; } +/// Unrolls this loop completely. +LogicalResult mlir::loopUnrollFull(scf::ForOp forOp) { + IRRewriter rewriter(forOp.getContext()); + std::optional mayBeConstantTripCount = getConstantTripCount(forOp); + if (!mayBeConstantTripCount.has_value()) + return failure(); + uint64_t tripCount = *mayBeConstantTripCount; + if (tripCount == 0) + return success(); + if (tripCount == 1) + return forOp.promoteIfSingleIteration(rewriter); + return loopUnrollByFactor(forOp, tripCount); +} + /// Check if bounds of all inner loops are defined outside of `forOp` /// and return false if not. 
static bool areInnerBoundsInvariant(scf::ForOp forOp) { diff --git a/mlir/test/Transforms/scf-loop-unroll.mlir b/mlir/test/Transforms/scf-loop-unroll.mlir index baf6b2970ac0e..0ef6ad15d4eb0 100644 --- a/mlir/test/Transforms/scf-loop-unroll.mlir +++ b/mlir/test/Transforms/scf-loop-unroll.mlir @@ -1,5 +1,6 @@ // RUN: mlir-opt %s --test-loop-unrolling="unroll-factor=3" -split-input-file -canonicalize | FileCheck %s // RUN: mlir-opt %s --test-loop-unrolling="unroll-factor=1" -split-input-file -canonicalize | FileCheck %s --check-prefix UNROLL-BY-1 +// RUN: mlir-opt %s --test-loop-unrolling="unroll-full=true" -split-input-file -canonicalize | FileCheck %s --check-prefix UNROLL-FULL // CHECK-LABEL: scf_loop_unroll_single func.func @scf_loop_unroll_single(%arg0 : f32, %arg1 : f32) -> f32 { @@ -56,3 +57,59 @@ func.func @scf_loop_unroll_factor_1_promote() -> () { // UNROLL-BY-1-NEXT: %[[C0:.*]] = arith.constant 0 : index // UNROLL-BY-1-NEXT: %{{.*}} = "test.foo"(%[[C0]]) : (index) -> i32 } + +// UNROLL-FULL-LABEL: func @scf_loop_unroll_full_single +// UNROLL-FULL-SAME: %[[ARG:.*]]: index) +func.func @scf_loop_unroll_full_single(%arg : index) -> index { + %0 = arith.constant 0 : index + %1 = arith.constant 1 : index + %2 = arith.constant 4 : index + %4 = scf.for %iv = %0 to %2 step %1 iter_args(%arg1 = %1) -> index { + %3 = arith.addi %arg1, %arg : index + scf.yield %3 : index + } + return %4 : index + // UNROLL-FULL: %[[C1:.*]] = arith.constant 1 : index + // UNROLL-FULL: %[[V0:.*]] = arith.addi %[[ARG]], %[[C1]] : index + // UNROLL-FULL: %[[V1:.*]] = arith.addi %[[V0]], %[[ARG]] : index + // UNROLL-FULL: %[[V2:.*]] = arith.addi %[[V1]], %[[ARG]] : index + // UNROLL-FULL: %[[V3:.*]] = arith.addi %[[V2]], %[[ARG]] : index + // UNROLL-FULL: return %[[V3]] : index +} + +// UNROLL-FULL-LABEL: func @scf_loop_unroll_full_outter_loops +// UNROLL-FULL-SAME: %[[ARG:.*]]: vector<4x4xindex>) +func.func @scf_loop_unroll_full_outter_loops(%arg0: vector<4x4xindex>) -> index { + %0 = arith.constant 0 : index + %1 = arith.constant 1 : index + %2 = arith.constant 4 : index + %6 = scf.for %arg1 = %0 to %2 step %1 iter_args(%it0 = %0) -> index { + %5 = scf.for %arg2 = %0 to %2 step %1 iter_args(%it1 = %it0) -> index { + %3 = vector.extract %arg0[%arg1, %arg2] : index from vector<4x4xindex> + %4 = arith.addi %3, %it1 : index + scf.yield %3 : index + } + scf.yield %5 : index + } + return %6 : index + // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index + // UNROLL-FULL: %[[C1:.*]] = arith.constant 1 : index + // UNROLL-FULL: %[[C4:.*]] = arith.constant 4 : index + // UNROLL-FULL: %[[SUM0:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C4]] step %[[C1]] iter_args(%{{.*}} = %[[C0]]) + // UNROLL-FULL: %[[VAL:.*]] = vector.extract %[[ARG]][0, %[[IV]]] : index from vector<4x4xindex> + // UNROLL-FULL: scf.yield %[[VAL]] : index + // UNROLL-FULL: } + // UNROLL-FULL: %[[SUM1:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C4]] step %[[C1]] iter_args(%{{.*}} = %[[SUM0]]) + // UNROLL-FULL: %[[VAL:.*]] = vector.extract %[[ARG]][1, %[[IV]]] : index from vector<4x4xindex> + // UNROLL-FULL: scf.yield %[[VAL]] : index + // UNROLL-FULL: } + // UNROLL-FULL: %[[SUM2:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C4]] step %[[C1]] iter_args(%{{.*}} = %[[SUM1]]) + // UNROLL-FULL: %[[VAL:.*]] = vector.extract %[[ARG]][2, %[[IV]]] : index from vector<4x4xindex> + // UNROLL-FULL: scf.yield %[[VAL]] : index + // UNROLL-FULL: } + // UNROLL-FULL: %[[SUM3:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C4]] step %[[C1]] iter_args(%{{.*}} = %[[SUM2]]) + // 
UNROLL-FULL: %[[VAL:.*]] = vector.extract %[[ARG]][3, %[[IV]]] : index from vector<4x4xindex> + // UNROLL-FULL: scf.yield %[[VAL]] : index + // UNROLL-FULL: } + // UNROLL-FULL: return %[[SUM3]] : index +} diff --git a/mlir/test/lib/Dialect/SCF/TestLoopUnrolling.cpp b/mlir/test/lib/Dialect/SCF/TestLoopUnrolling.cpp index 8694a7f9bbd62..ced003305a7b8 100644 --- a/mlir/test/lib/Dialect/SCF/TestLoopUnrolling.cpp +++ b/mlir/test/lib/Dialect/SCF/TestLoopUnrolling.cpp @@ -42,10 +42,11 @@ struct TestLoopUnrollingPass TestLoopUnrollingPass(const TestLoopUnrollingPass &) {} explicit TestLoopUnrollingPass(uint64_t unrollFactorParam, unsigned loopDepthParam, - bool annotateLoopParam) { + bool annotateLoopParam, bool unrollFullParam) { unrollFactor = unrollFactorParam; loopDepth = loopDepthParam; annotateLoop = annotateLoopParam; + unrollFull = unrollFactorParam; } void getDependentDialects(DialectRegistry ®istry) const override { @@ -63,8 +64,12 @@ struct TestLoopUnrollingPass op->setAttr("unrolled_iteration", b.getUI32IntegerAttr(i)); } }; - for (auto loop : loops) - (void)loopUnrollByFactor(loop, unrollFactor, annotateFn); + for (auto loop : loops) { + if (unrollFull) + (void)loopUnrollFull(loop); + else + (void)loopUnrollByFactor(loop, unrollFactor, annotateFn); + } } Option unrollFactor{*this, "unroll-factor", llvm::cl::desc("Loop unroll factor."), @@ -77,6 +82,9 @@ struct TestLoopUnrollingPass llvm::cl::init(false)}; Option loopDepth{*this, "loop-depth", llvm::cl::desc("Loop depth."), llvm::cl::init(0)}; + Option unrollFull{*this, "unroll-full", + llvm::cl::desc("Full unroll loops."), + llvm::cl::init(false)}; }; } // namespace From d49c3181e68de20ebd5f8e9fc5f4ad3ce20cad6d Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 18 Feb 2025 08:08:24 +0000 Subject: [PATCH 086/127] [X86] emitEpilogue - silence implicit integer extension warning. (#127185) Silence compiler warning introduced in #125007 - assign the address delta to int64_t, assert it is negative and negate it only as part of the mergeSPAdd call Fixes #125825 --- llvm/lib/Target/X86/X86FrameLowering.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 50c56c9dd08b3..10fb6994b51b6 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2639,11 +2639,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) { // Add the return addr area delta back since we are not tail calling. - int64_t Offset = -1 * X86FI->getTCReturnAddrDelta(); - assert(Offset >= 0 && "TCDelta should never be positive"); - if (Offset) { + int64_t Delta = X86FI->getTCReturnAddrDelta(); + assert(Delta <= 0 && "TCDelta should never be positive"); + if (Delta) { // Check for possible merge with preceding ADD instruction. - Offset = mergeSPAdd(MBB, Terminator, Offset, true); + int64_t Offset = mergeSPAdd(MBB, Terminator, -Delta, true); emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true); } } From 1ae9dd31a21022d360e5f1efa32a71c4073c7e18 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 18 Feb 2025 08:08:58 +0000 Subject: [PATCH 087/127] [X86] combineConcatVectorOps - add EXTEND_VECTOR_INREG() 512-bit handling (#127530) Support extension to 512-bit vectors on AVX512/BWI targets. 
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +- .../vector-interleaved-store-i8-stride-8.ll | 1876 ++++++++--------- 2 files changed, 890 insertions(+), 992 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6ed69dbd6dae0..386d56dcda9de 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -57883,8 +57883,10 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, case ISD::SIGN_EXTEND_VECTOR_INREG: case ISD::ZERO_EXTEND_VECTOR_INREG: { // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND. - if (!IsSplat && NumOps == 2 && VT.is256BitVector() && - Subtarget.hasInt256() && + if (!IsSplat && NumOps == 2 && + ((VT.is256BitVector() && Subtarget.hasInt256()) || + (VT.is512BitVector() && Subtarget.useAVX512Regs() && + (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) && Op0.getOperand(0).getValueType().is128BitVector() && Op0.getOperand(0).getValueType() == Ops[0].getOperand(0).getValueType()) { diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index 6fee9377d261a..ba51c65ccab13 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -6721,7 +6721,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa (%rcx), %xmm3 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX512-NEXT: vmovdqa 32(%rcx), %xmm12 ; AVX512-NEXT: vmovdqa 48(%rcx), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6729,11 +6729,11 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm10 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm6 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa (%r10), %xmm5 ; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa 48(%r10), %xmm3 @@ -6742,8 +6742,8 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa 48(%rax), %xmm4 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 ; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] @@ -6751,18 +6751,19 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa (%r9), %xmm5 ; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 48(%r9), %xmm7 +; AVX512-NEXT: vmovdqa 48(%r9), %xmm6 ; AVX512-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 48(%r8), %xmm12 +; AVX512-NEXT: vmovdqa 48(%r8), %xmm8 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 ; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm8 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] @@ -6770,80 +6771,80 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm9 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm5 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm5, %ymm5 ; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,1,3,3,4,5,6,7] -; 
AVX512-NEXT: vinserti128 $1, %xmm13, %ymm5, %ymm5 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm5, %ymm5 ; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm13, %ymm2 -; AVX512-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm9, %ymm26 ; AVX512-NEXT: vmovdqa 32(%r10), %xmm2 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm10 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm9, %ymm0, %ymm28 ; AVX512-NEXT: vmovdqa 32(%rax), %xmm0 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm13, %ymm28 +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm9, %ymm29 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm30 +; AVX512-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm25 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm27 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm23 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,1,3,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm26 +; AVX512-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm19 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm22 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm17 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm20 +; AVX512-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm16 
; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm18 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm14 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm3, %ymm25 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm6, %ymm3, %ymm20 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm21 +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm18 ; AVX512-NEXT: vmovdqa 32(%r9), %xmm1 ; AVX512-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm13, %ymm12, %ymm19 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm12, %ymm17 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm7, %ymm4 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm15 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm13 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm12[8],xmm5[9],xmm12[9],xmm5[10],xmm12[10],xmm5[11],xmm12[11],xmm5[12],xmm12[12],xmm5[13],xmm12[13],xmm5[14],xmm12[14],xmm5[15],xmm12[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm4 ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm4 +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm4 ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] @@ -6863,273 +6864,244 @@ define void @store_i8_stride8_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 16(%rcx), %xmm11 -; AVX512-NEXT: vmovdqa 16(%rdx), %xmm7 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] +; AVX512-NEXT: vmovdqa 16(%rcx), %xmm4 +; AVX512-NEXT: vmovdqa 16(%rdx), %xmm3 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 16(%r10), %xmm1 -; AVX512-NEXT: vmovdqa 16(%rax), %xmm15 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm16 +; AVX512-NEXT: vmovdqa 16(%r10), %xmm9 +; AVX512-NEXT: vmovdqa 16(%rax), %xmm8 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm1, %ymm30 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm29 -; AVX512-NEXT: vmovdqa 16(%r9), %xmm14 -; AVX512-NEXT: vmovdqa 16(%r8), %xmm12 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm13, %ymm1, %ymm31 -; AVX512-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm24 -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm13 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm31 +; AVX512-NEXT: vmovdqa 16(%r9), %xmm6 +; AVX512-NEXT: vmovdqa 16(%r8), %xmm5 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm11, %ymm1, %ymm27 ; AVX512-NEXT: vmovdqa 48(%rsi), %xmm1 -; AVX512-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm13, %zmm6 -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm13 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm13 & (zmm6 ^ zmm0)) +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm11 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3],xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] +; AVX512-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] +; AVX512-NEXT: vmovdqa64 %xmm7, %xmm21 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm22 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero,xmm11[2],zero,zero,zero,xmm11[3],zero,zero,zero,xmm11[4],zero,zero,zero,xmm11[5],zero,zero,zero,xmm11[6],zero,zero,zero,xmm11[7],zero,zero,zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512-NEXT: # ymm11 = mem[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm11 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm0 ^ (zmm11 & (zmm22 ^ zmm0)) ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm3 # 32-byte Folded Reload -; AVX512-NEXT: # ymm3 = mem[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX512-NEXT: vpandnq %zmm0, %zmm8, %zmm0 -; AVX512-NEXT: vpandq %zmm8, %zmm3, %zmm3 +; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512-NEXT: # ymm7 = mem[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512-NEXT: # ymm7 = mem[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512-NEXT: # ymm10 = mem[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm10 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm7 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX512-NEXT: vpandnq %zmm0, %zmm7, %zmm0 +; AVX512-NEXT: vpandq %zmm7, %zmm10, %zmm10 ; AVX512-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpord %zmm0, %zmm3, %zmm6 {%k1} +; AVX512-NEXT: vpord %zmm0, %zmm10, %zmm22 {%k1} ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm9 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm0 ^ (zmm13 & (zmm9 ^ zmm0)) +; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512-NEXT: # ymm10 = mem[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm0 ^ (zmm11 & (zmm24 ^ zmm0)) ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512-NEXT: # ymm2 = mem[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512-NEXT: # ymm2 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd $232, (%rsp), %ymm3 # 32-byte Folded Reload -; AVX512-NEXT: # ymm3 = mem[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-NEXT: vpandnq %zmm0, %zmm8, %zmm0 -; AVX512-NEXT: vpandq %zmm8, %zmm2, %zmm2 -; AVX512-NEXT: vpord %zmm0, %zmm2, %zmm9 {%k1} -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX512-NEXT: 
vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm10 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm13 & (zmm10 ^ zmm0)) -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm30[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512-NEXT: # ymm10 = mem[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512-NEXT: # ymm10 = mem[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm26 = ymm26[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm26, %zmm10, %zmm10 +; AVX512-NEXT: vpandnq %zmm0, %zmm7, %zmm0 +; AVX512-NEXT: vpandq %zmm7, %zmm10, %zmm10 +; AVX512-NEXT: vpord %zmm0, %zmm10, %zmm24 {%k1} +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm11 & (zmm10 ^ zmm0)) +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512-NEXT: vpandnq %zmm0, %zmm8, %zmm0 -; AVX512-NEXT: vpandq %zmm8, %zmm1, %zmm1 +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm1, %zmm1 +; AVX512-NEXT: vpandnq %zmm0, %zmm7, %zmm0 +; AVX512-NEXT: vpandq %zmm7, %zmm1, %zmm1 ; AVX512-NEXT: vpord %zmm0, %zmm1, %zmm10 {%k1} -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm20[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm27 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm18 -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm17 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm2[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm19 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm14 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm13 & (zmm4 ^ zmm2)) -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm19[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm17[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512-NEXT: vpandnq %zmm2, %zmm8, %zmm2 -; AVX512-NEXT: vpandq %zmm8, %zmm5, %zmm5 -; AVX512-NEXT: vpord %zmm2, %zmm5, %zmm4 {%k1} -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm2, %ymm17 -; AVX512-NEXT: vmovdqa64 %xmm16, %xmm2 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm11 & (zmm4 ^ zmm1)) +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm3, %zmm3 +; AVX512-NEXT: vpandnq %zmm1, %zmm7, %zmm1 +; AVX512-NEXT: vpandq %zmm7, %zmm3, %zmm3 +; AVX512-NEXT: vpord %zmm1, %zmm3, %zmm4 {%k1} +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm1, %ymm20 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm3, %ymm21 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm8, %ymm3, %ymm23 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm3, %ymm22 
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm9 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm2, %ymm30 +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,5,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX512-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm5[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm12, %ymm14 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX512-NEXT: # xmm5 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm6, %ymm13 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm12, %ymm15 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX512-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm12, %ymm19 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm15 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX512-NEXT: # xmm5 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm6, %ymm16 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm7, %ymm20 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX512-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm1, %ymm18 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX512-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 -; AVX512-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] -; AVX512-NEXT: vmovdqa64 %xmm23, %xmm12 -; AVX512-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] -; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512-NEXT: # ymm23 = mem[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512-NEXT: # ymm24 = mem[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm23 -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm24 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm24, %zmm5 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm23 ^ (zmm13 & (zmm5 ^ zmm23)) -; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512-NEXT: # ymm23 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512-NEXT: # ymm24 = mem[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm23 -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm24 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero,xmm11[2],zero,zero,zero,xmm11[3],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 +; AVX512-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX512-NEXT: 
vmovdqa 16(%rdi), %xmm14 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15] +; AVX512-NEXT: vmovdqa64 %xmm21, %xmm14 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm14 ^ (zmm11 & (zmm0 ^ zmm14)) +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero,xmm8[4],zero,zero,zero,xmm8[5],zero,zero,zero,xmm8[6],zero,zero,zero,xmm8[7],zero,zero,zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero,xmm12[4],zero,zero,zero,xmm12[5],zero,zero,zero,xmm12[6],zero,zero,zero,xmm12[7],zero,zero,zero +; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512-NEXT: # ymm14 = mem[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512-NEXT: # ymm25 = mem[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload ; AVX512-NEXT: # ymm25 = mem[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload ; AVX512-NEXT: # ymm26 = mem[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm25 -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm26 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] -; AVX512-NEXT: vpandnq %zmm23, %zmm8, %zmm23 -; AVX512-NEXT: vpandq %zmm8, %zmm25, %zmm25 -; AVX512-NEXT: vpord %zmm23, %zmm25, %zmm5 {%k1} -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm23 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] -; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload -; AVX512-NEXT: # ymm25 = mem[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload -; AVX512-NEXT: # ymm28 = mem[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm25, %zmm25 -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm11 = 
xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero,xmm11[2],zero,zero,zero,xmm11[3],zero,zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm24, %zmm11 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm25 ^ (zmm13 & (zmm11 ^ zmm25)) -; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512-NEXT: # ymm24 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm25 = ymm29[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm24 -; AVX512-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm16 = ymm27[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm25, %zmm16 -; AVX512-NEXT: vpandnq %zmm24, %zmm8, %zmm24 -; AVX512-NEXT: vpandq %zmm8, %zmm16, %zmm16 -; AVX512-NEXT: vpord %zmm24, %zmm16, %zmm11 {%k1} -; AVX512-NEXT: vpshufd {{.*#+}} ymm16 = ymm18[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm26, %zmm7 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm16 ^ (zmm13 & (zmm7 ^ zmm16)) -; AVX512-NEXT: vpshufd {{.*#+}} ymm16 = ymm21[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm17 = ymm22[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpandnq %zmm14, %zmm7, %zmm14 +; AVX512-NEXT: vpandq %zmm7, %zmm25, %zmm25 +; AVX512-NEXT: vpord %zmm14, %zmm25, %zmm0 {%k1} +; AVX512-NEXT: vpshufd $212, (%rsp), %ymm14 # 32-byte Folded Reload +; AVX512-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm14 ^ (zmm11 & (zmm8 ^ zmm14)) +; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 +; AVX512-NEXT: vpshufd {{.*#+}} ymm21 = ymm27[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm21, %zmm17 +; AVX512-NEXT: vpandnq %zmm14, %zmm7, %zmm14 +; AVX512-NEXT: vpandq %zmm7, %zmm17, %zmm17 +; AVX512-NEXT: vpord %zmm14, %zmm17, %zmm8 {%k1} +; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm17 = ymm20[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm14 ^ (zmm11 & (zmm6 ^ zmm14)) +; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9 +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-NEXT: vpandnq %zmm16, %zmm8, %zmm3 -; AVX512-NEXT: vpandq %zmm8, %zmm2, %zmm2 -; AVX512-NEXT: vpord %zmm3, %zmm2, %zmm7 {%k1} -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpandnq %zmm9, %zmm7, %zmm3 +; AVX512-NEXT: vpandq %zmm7, %zmm2, %zmm2 +; AVX512-NEXT: vpord %zmm3, %zmm2, %zmm6 {%k1} +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] ; 
AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm23, %zmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm2 ^ (zmm13 & (zmm3 ^ zmm2)) -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm20[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm2 ^ (zmm11 & (zmm12 ^ zmm2)) +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpandnq %zmm2, %zmm8, %zmm1 -; AVX512-NEXT: vpandq %zmm8, %zmm0, %zmm0 -; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm3 {%k1} +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512-NEXT: vpandnq %zmm2, %zmm7, %zmm2 +; AVX512-NEXT: vpandq %zmm7, %zmm1, %zmm1 +; AVX512-NEXT: vpord %zmm2, %zmm1, %zmm12 {%k1} ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512-NEXT: vmovdqa64 %zmm4, 256(%rax) ; AVX512-NEXT: vmovdqa64 %zmm10, 448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm24, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rax) ; AVX512-NEXT: addq $552, %rsp # imm = 0x228 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -7433,7 +7405,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm3 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm11 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm12 ; AVX512DQ-NEXT: vmovdqa 48(%rcx), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7441,11 +7413,11 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm10 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm6 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa (%r10), %xmm5 
; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa 48(%r10), %xmm3 @@ -7454,8 +7426,8 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa 48(%rax), %xmm4 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 ; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] @@ -7463,18 +7435,19 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa (%r9), %xmm5 ; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 48(%r9), %xmm7 +; AVX512DQ-NEXT: vmovdqa 48(%r9), %xmm6 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 48(%r8), %xmm12 +; AVX512DQ-NEXT: vmovdqa 48(%r8), %xmm8 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 ; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm8 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] @@ -7482,80 +7455,80 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm9 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpshuflw 
{{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm5, %ymm5 ; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm5, %ymm5 ; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm13, %ymm2 -; AVX512DQ-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm9, %ymm26 ; AVX512DQ-NEXT: vmovdqa 32(%r10), %xmm2 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm10 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm9, %ymm0, %ymm28 ; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm0 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm13, %ymm28 +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm9, %ymm29 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm30 +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm25 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm27 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm23 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,1,3,4,5,6,7] ; 
AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm26 +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm19 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm22 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm17 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm20 +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm16 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm18 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm14 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm3, %ymm25 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm6, %ymm3, %ymm20 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm21 +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm18 ; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm1 ; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm13, %ymm12, %ymm19 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm12, %ymm17 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm7, %ymm4 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm15 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm13 +; AVX512DQ-NEXT: vpunpckhbw 
{{.*#+}} xmm5 = xmm5[8],xmm12[8],xmm5[9],xmm12[9],xmm5[10],xmm12[10],xmm5[11],xmm12[11],xmm5[12],xmm12[12],xmm5[13],xmm12[13],xmm5[14],xmm12[14],xmm5[15],xmm12[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm4 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm4 +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm4 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] @@ -7575,273 +7548,244 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 16(%rcx), %xmm11 -; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm7 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] +; AVX512DQ-NEXT: vmovdqa 16(%rcx), %xmm4 +; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm3 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 16(%r10), %xmm1 -; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm15 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm16 +; AVX512DQ-NEXT: vmovdqa 16(%r10), %xmm9 +; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm8 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm1, %ymm30 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm29 -; AVX512DQ-NEXT: vmovdqa 16(%r9), %xmm14 -; AVX512DQ-NEXT: 
vmovdqa 16(%r8), %xmm12 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm13, %ymm1, %ymm31 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm24 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm13 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm31 +; AVX512DQ-NEXT: vmovdqa 16(%r9), %xmm6 +; AVX512DQ-NEXT: vmovdqa 16(%r8), %xmm5 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm11, %ymm1, %ymm27 ; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm1 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm13, %zmm6 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm13 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm13 & (zmm6 ^ zmm0)) +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm11 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3],xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm21 +; 
AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm22 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero,xmm11[2],zero,zero,zero,xmm11[3],zero,zero,zero,xmm11[4],zero,zero,zero,xmm11[5],zero,zero,zero,xmm11[6],zero,zero,zero,xmm11[7],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm11 = mem[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm11 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm0 ^ (zmm11 & (zmm22 ^ zmm0)) ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm3 = mem[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX512DQ-NEXT: vpandnq %zmm0, %zmm8, %zmm0 -; AVX512DQ-NEXT: vpandq %zmm8, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm7 = mem[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm7 = mem[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm10 = mem[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm10 +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm7 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX512DQ-NEXT: vpandnq %zmm0, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpandq %zmm7, %zmm10, %zmm10 ; AVX512DQ-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vpord %zmm0, %zmm3, %zmm6 {%k1} +; AVX512DQ-NEXT: vpord %zmm0, %zmm10, %zmm22 {%k1} ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm0 ^ (zmm13 & (zmm9 ^ zmm0)) +; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm10 = mem[2,1,3,3,6,5,7,7] 
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm24 = zmm0 ^ (zmm11 & (zmm24 ^ zmm0)) ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm2 = mem[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm2 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd $232, (%rsp), %ymm3 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm3 = mem[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpandnq %zmm0, %zmm8, %zmm0 -; AVX512DQ-NEXT: vpandq %zmm8, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpord %zmm0, %zmm2, %zmm9 {%k1} -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm10 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm13 & (zmm10 ^ zmm0)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm30[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm10 = mem[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm10 = mem[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm26 = ymm26[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm26, %zmm10, %zmm10 +; AVX512DQ-NEXT: vpandnq %zmm0, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpandq %zmm7, %zmm10, %zmm10 +; AVX512DQ-NEXT: vpord %zmm0, %zmm10, %zmm24 {%k1} +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm11 & (zmm10 ^ zmm0)) +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpandnq %zmm0, %zmm8, %zmm0 -; AVX512DQ-NEXT: vpandq %zmm8, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpandnq %zmm0, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpandq %zmm7, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpord %zmm0, %zmm1, 
%zmm10 {%k1} -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm20[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm27 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm18 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm17 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm19 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm14 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm13 & (zmm4 ^ zmm2)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm19[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm17[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpandnq %zmm2, %zmm8, %zmm2 -; AVX512DQ-NEXT: vpandq %zmm8, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpord %zmm2, %zmm5, %zmm4 {%k1} -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm2, %ymm17 -; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm2 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm4 = 
xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm11 & (zmm4 ^ zmm1)) +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpandnq %zmm1, %zmm7, %zmm1 +; AVX512DQ-NEXT: vpandq %zmm7, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpord %zmm1, %zmm3, %zmm4 {%k1} +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm1, %ymm20 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm3, %ymm21 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm8, %ymm3, %ymm23 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm3, %ymm22 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm9 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm2, %ymm30 +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm5[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm12, %ymm14 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm5 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm13, 
%ymm6, %ymm13 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm12, %ymm15 -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3],xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm12, %ymm19 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm15 +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm5 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm6, %ymm16 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm7, %ymm20 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm1, %ymm18 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 -; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm7 = 
xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] -; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm12 -; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] -; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm23 = mem[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm24 = mem[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm23 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm24 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm24, %zmm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm23 ^ (zmm13 & (zmm5 ^ zmm23)) -; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm23 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm24 = mem[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm23 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm24 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero,xmm11[2],zero,zero,zero,xmm11[3],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 +; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm14 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm14 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm14 ^ (zmm11 & (zmm0 ^ zmm14)) +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero,xmm8[4],zero,zero,zero,xmm8[5],zero,zero,zero,xmm8[6],zero,zero,zero,xmm8[7],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm6 = 
xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero,xmm12[4],zero,zero,zero,xmm12[5],zero,zero,zero,xmm12[6],zero,zero,zero,xmm12[7],zero,zero,zero +; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm14 = mem[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm25 = mem[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm25 = mem[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm26 = mem[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm25 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm26 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] -; AVX512DQ-NEXT: vpandnq %zmm23, %zmm8, %zmm23 -; AVX512DQ-NEXT: vpandq %zmm8, %zmm25, %zmm25 -; AVX512DQ-NEXT: vpord %zmm23, %zmm25, %zmm5 {%k1} -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm23 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] -; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm25 = mem[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm28 = mem[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm28, %zmm25, %zmm25 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero,xmm11[2],zero,zero,zero,xmm11[3],zero,zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm24, %zmm11 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm25 ^ (zmm13 & (zmm11 ^ zmm25)) -; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm24 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm25 = ymm29[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm24 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm16 = ymm27[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm25, %zmm16 -; AVX512DQ-NEXT: vpandnq %zmm24, %zmm8, %zmm24 -; AVX512DQ-NEXT: vpandq %zmm8, %zmm16, %zmm16 -; AVX512DQ-NEXT: vpord %zmm24, %zmm16, %zmm11 {%k1} -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm16 = ymm18[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm26, %zmm7 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm16 ^ (zmm13 & (zmm7 ^ zmm16)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm16 = ymm21[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm17 = ymm22[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[0,0,2,1,4,4,6,5] +; 
AVX512DQ-NEXT: vpandnq %zmm14, %zmm7, %zmm14 +; AVX512DQ-NEXT: vpandq %zmm7, %zmm25, %zmm25 +; AVX512DQ-NEXT: vpord %zmm14, %zmm25, %zmm0 {%k1} +; AVX512DQ-NEXT: vpshufd $212, (%rsp), %ymm14 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm14 ^ (zmm11 & (zmm8 ^ zmm14)) +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm21 = ymm27[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm21, %zmm17 +; AVX512DQ-NEXT: vpandnq %zmm14, %zmm7, %zmm14 +; AVX512DQ-NEXT: vpandq %zmm7, %zmm17, %zmm17 +; AVX512DQ-NEXT: vpord %zmm14, %zmm17, %zmm8 {%k1} +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm17 = ymm20[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm14 ^ (zmm11 & (zmm6 ^ zmm14)) +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpandnq %zmm16, %zmm8, %zmm3 -; AVX512DQ-NEXT: vpandq %zmm8, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpord %zmm3, %zmm2, %zmm7 {%k1} -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpandnq %zmm9, %zmm7, %zmm3 +; AVX512DQ-NEXT: vpandq %zmm7, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpord %zmm3, %zmm2, %zmm6 {%k1} +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm23, %zmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm2 ^ (zmm13 & (zmm3 ^ zmm2)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm20[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm2 ^ (zmm11 & (zmm12 ^ zmm2)) +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpandnq %zmm2, %zmm8, %zmm1 -; AVX512DQ-NEXT: vpandq %zmm8, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm3 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512DQ-NEXT: vpandnq %zmm2, %zmm7, %zmm2 +; AVX512DQ-NEXT: vpandq %zmm7, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpord %zmm2, %zmm1, %zmm12 {%k1} ; AVX512DQ-NEXT: 
movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%rax) ; AVX512DQ-NEXT: addq $552, %rsp # imm = 0x228 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -8146,284 +8090,260 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa 16(%r10), %xmm12 ; AVX512BW-NEXT: vmovdqa64 32(%r10), %xmm16 ; AVX512BW-NEXT: vmovdqa 48(%r10), %xmm15 -; AVX512BW-NEXT: vmovdqa (%rax), %xmm2 +; AVX512BW-NEXT: vmovdqa (%rax), %xmm1 ; AVX512BW-NEXT: vmovdqa 16(%rax), %xmm13 ; AVX512BW-NEXT: vmovdqa64 32(%rax), %xmm17 ; AVX512BW-NEXT: vmovdqa64 48(%rax), %xmm18 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3] -; AVX512BW-NEXT: vpermw %ymm1, %ymm3, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa (%r9), %xmm4 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3] +; AVX512BW-NEXT: vpermw %ymm3, %ymm2, %ymm3 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqa (%r9), %xmm3 ; AVX512BW-NEXT: vmovdqa64 48(%r9), %xmm19 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm5 -; AVX512BW-NEXT: vmovdqa64 48(%r8), %xmm21 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7] +; AVX512BW-NEXT: vmovdqa (%r8), %xmm4 +; AVX512BW-NEXT: vmovdqa64 48(%r8), %xmm20 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,5,5,7] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm6 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3] -; AVX512BW-NEXT: vpermw %ymm7, %ymm6, %ymm7 +; AVX512BW-NEXT: vinserti128 $1, %xmm8, 
%ymm5, %ymm5 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3] +; AVX512BW-NEXT: vpermw %ymm7, %ymm5, %ymm7 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm14 ; AVX512BW-NEXT: movl $-2004318072, %eax # imm = 0x88888888 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512BW-NEXT: vmovdqa64 48(%rcx), %xmm21 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512BW-NEXT: vmovdqa64 48(%rdx), %xmm23 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,1,3,3,6,5,7,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7] +; AVX512BW-NEXT: vpermw %ymm7, %ymm9, %ymm7 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm22 +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm10 ; AVX512BW-NEXT: vmovdqa64 48(%rsi), %xmm24 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512BW-NEXT: vmovdqa64 48(%rdi), %xmm26 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512BW-NEXT: vmovdqa64 48(%rcx), %xmm28 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512BW-NEXT: vmovdqa64 48(%rdx), %xmm30 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm20[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm11, %ymm11 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm11[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7] -; AVX512BW-NEXT: vpermw %ymm20, %ymm11, %ymm20 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm20 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero ; AVX512BW-NEXT: movl $572662306, %eax # imm = 0x22222222 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm7 {%k2} ; AVX512BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-NEXT: kmovd %eax, 
%k3 -; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm7 {%k3} ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm14[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm14, %ymm3, %ymm14 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm14, %zmm14 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,5,5,7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm22, %ymm22 +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm14[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm22, %ymm22 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm22[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm20, %ymm6, %ymm20 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm23 -; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm20 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm14, %zmm20, %zmm14 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm22, %ymm25 +; AVX512BW-NEXT: vpermw %ymm14, %ymm2, %ymm14 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm22, %zmm14, %zmm14 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm22[0,1,2,3,4,5,5,7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm22[0,1,2,3,6,5,7,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpermw %ymm22, %ymm5, %ymm22 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm22, %zmm25 +; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm25 {%k1} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm23[0],xmm21[0],xmm23[1],xmm21[1],xmm23[2],xmm21[2],xmm23[3],xmm21[3],xmm23[4],xmm21[4],xmm23[5],xmm21[5],xmm23[6],xmm21[6],xmm23[7],xmm21[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm14[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm26, %ymm22, %ymm22 +; AVX512BW-NEXT: vmovdqa64 48(%rdi), %xmm27 +; 
AVX512BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm22[2,1,3,3,6,5,7,7] +; AVX512BW-NEXT: vpermw %ymm14, %ymm9, %ymm14 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm22, %zmm14, %zmm22 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm27[0],xmm24[0],xmm27[1],xmm24[1],xmm27[2],xmm24[2],xmm27[3],xmm24[3],xmm27[4],xmm24[4],xmm27[5],xmm24[5],xmm27[6],xmm24[6],xmm27[7],xmm24[7] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero,xmm14[4],zero,zero,zero,xmm14[5],zero,zero,zero,xmm14[6],zero,zero,zero,xmm14[7],zero,zero,zero +; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqa64 32(%r9), %xmm22 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vpermw %ymm20, %ymm11, %ymm20 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm20 -; AVX512BW-NEXT: vmovdqa64 32(%r8), %xmm25 -; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 32(%rsi), %xmm20 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm14 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm14 {%k3} ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm18 = xmm15[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm15[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm18, %ymm18 -; AVX512BW-NEXT: vmovdqa64 32(%rdi), %xmm23 +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm15[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm18, %ymm18 +; AVX512BW-NEXT: vmovdqa64 32(%r8), %xmm25 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm15, %ymm3, %ymm15 +; AVX512BW-NEXT: vpermw %ymm15, %ymm2, %ymm15 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm18, %zmm15, %zmm15 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,5,5,7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm19 -; AVX512BW-NEXT: vmovdqa64 32(%rcx), %xmm27 +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm18[0,1,2,3,6,5,7,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm19, %ymm19 +; AVX512BW-NEXT: vmovdqa64 32(%rcx), %xmm20 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm18, %ymm6, %ymm18 +; AVX512BW-NEXT: vpermw %ymm18, %ymm5, %ymm18 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm18 -; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm29 +; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm26 ; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm18 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm19 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero,xmm15[2],zero,zero,zero,xmm15[3],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm15 = 
xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero,xmm15[2],zero,zero,zero,xmm15[3],zero,zero,zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm19, %zmm15 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm19[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm19[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vpermw %ymm19, %ymm11, %ymm19 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm19, %zmm19 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm23[8],xmm21[8],xmm23[9],xmm21[9],xmm23[10],xmm21[10],xmm23[11],xmm21[11],xmm23[12],xmm21[12],xmm23[13],xmm21[13],xmm23[14],xmm21[14],xmm23[15],xmm21[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm15[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm15[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm19 +; AVX512BW-NEXT: vmovdqa64 32(%rsi), %xmm21 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[2,1,3,3,6,5,7,7] +; AVX512BW-NEXT: vpermw %ymm15, %ymm9, %ymm15 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm19 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm27[8],xmm24[8],xmm27[9],xmm24[9],xmm27[10],xmm24[10],xmm27[11],xmm24[11],xmm27[12],xmm24[12],xmm27[13],xmm24[13],xmm27[14],xmm24[14],xmm27[15],xmm24[15] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero,xmm15[2],zero,zero,zero,xmm15[3],zero,zero,zero,xmm15[4],zero,zero,zero,xmm15[5],zero,zero,zero,xmm15[6],zero,zero,zero,xmm15[7],zero,zero,zero ; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm15 {%k2} ; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm15 {%k3} ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm19 +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm18[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm19, %ymm19 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm18, %ymm3, %ymm18 +; AVX512BW-NEXT: vpermw %ymm18, %ymm2, %ymm18 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm18 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm19[0,1,2,3,4,5,5,7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,5,5,7] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm19[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm19, %ymm6, %ymm19 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm19, %zmm24 -; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm24 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm23[0],xmm20[0],xmm23[1],xmm20[1],xmm23[2],xmm20[2],xmm23[3],xmm20[3],xmm23[4],xmm20[4],xmm23[5],xmm20[5],xmm23[6],xmm20[6],xmm23[7],xmm20[7] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm19 = 
xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero,xmm18[2],zero,zero,zero,xmm18[3],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm18 = xmm18[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero,xmm18[2],zero,zero,zero,xmm18[3],zero,zero,zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm18, %zmm19, %zmm18 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm29[0],xmm27[0],xmm29[1],xmm27[1],xmm29[2],xmm27[2],xmm29[3],xmm27[3],xmm29[4],xmm27[4],xmm29[5],xmm27[5],xmm29[6],xmm27[6],xmm29[7],xmm27[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm19[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm19[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm26, %ymm21, %ymm26 -; AVX512BW-NEXT: vmovdqa64 16(%r9), %xmm21 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm26 = ymm26[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vpermw %ymm19, %ymm11, %ymm19 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm26, %zmm19, %zmm19 -; AVX512BW-NEXT: vmovdqa64 16(%r8), %xmm26 +; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm23, %ymm23 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpermw %ymm19, %ymm5, %ymm19 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm23, %zmm19, %zmm23 +; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm23 {%k1} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm26[0],xmm20[0],xmm26[1],xmm20[1],xmm26[2],xmm20[2],xmm26[3],xmm20[3],xmm26[4],xmm20[4],xmm26[5],xmm20[5],xmm26[6],xmm20[6],xmm26[7],xmm20[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm18[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm19, %ymm19 +; AVX512BW-NEXT: vmovdqa64 32(%rdi), %xmm27 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[2,1,3,3,6,5,7,7] +; AVX512BW-NEXT: vpermw %ymm18, %ymm9, %ymm18 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm19 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3],xmm27[4],xmm21[4],xmm27[5],xmm21[5],xmm27[6],xmm21[6],xmm27[7],xmm21[7] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero,xmm18[2],zero,zero,zero,xmm18[3],zero,zero,zero,xmm18[4],zero,zero,zero,xmm18[5],zero,zero,zero,xmm18[6],zero,zero,zero,xmm18[7],zero,zero,zero ; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 16(%rsi), %xmm19 -; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm18 {%k3} +; AVX512BW-NEXT: vmovdqa64 16(%r9), %xmm19 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k3} ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm16[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm16[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm17, %ymm17 -; AVX512BW-NEXT: vmovdqa64 16(%rdi), %xmm24 +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm16[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm17, %ymm17 +; AVX512BW-NEXT: vmovdqa64 16(%r8), %xmm23 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm16, %ymm3, %ymm16 +; AVX512BW-NEXT: vpermw %ymm16, %ymm2, %ymm16 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = 
xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm17[0,1,2,3,4,5,5,7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm17[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm22, %ymm25 -; AVX512BW-NEXT: vmovdqa64 16(%rcx), %xmm22 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm17, %ymm6, %ymm17 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm17, %zmm17 -; AVX512BW-NEXT: vmovdqa64 16(%rdx), %xmm25 -; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm17 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm23[8],xmm20[8],xmm23[9],xmm20[9],xmm23[10],xmm20[10],xmm23[11],xmm20[11],xmm23[12],xmm20[12],xmm23[13],xmm20[13],xmm23[14],xmm20[14],xmm23[15],xmm20[15] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm20 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm16 = xmm16[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm16, %zmm20, %zmm16 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm27[8],xmm29[9],xmm27[9],xmm29[10],xmm27[10],xmm29[11],xmm27[11],xmm29[12],xmm27[12],xmm29[13],xmm27[13],xmm29[14],xmm27[14],xmm29[15],xmm27[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm20[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm23, %ymm23 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vpermw %ymm20, %ymm11, %ymm20 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm20 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm22[0,1,2,3,4,5,5,7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm22[0,1,2,3,6,5,7,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm17, %ymm24 +; AVX512BW-NEXT: vmovdqa64 16(%rcx), %xmm17 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpermw %ymm22, %ymm5, %ymm22 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm24, %zmm22, %zmm25 +; AVX512BW-NEXT: vmovdqa64 16(%rdx), %xmm24 +; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm25 {%k1} +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm26[8],xmm20[8],xmm26[9],xmm20[9],xmm26[10],xmm20[10],xmm26[11],xmm20[11],xmm26[12],xmm20[12],xmm26[13],xmm20[13],xmm26[14],xmm20[14],xmm26[15],xmm20[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm16[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm16[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20 +; AVX512BW-NEXT: vmovdqa64 16(%rsi), %xmm22 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7] +; AVX512BW-NEXT: vpermw %ymm16, %ymm9, %ymm16 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm16, %zmm20 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm27[8],xmm21[8],xmm27[9],xmm21[9],xmm27[10],xmm21[10],xmm27[11],xmm21[11],xmm27[12],xmm21[12],xmm27[13],xmm21[13],xmm27[14],xmm21[14],xmm27[15],xmm21[15] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm16 = 
xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero,xmm16[4],zero,zero,zero,xmm16[5],zero,zero,zero,xmm16[6],zero,zero,zero,xmm16[7],zero,zero,zero ; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm16 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm16 {%k3} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm17[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm17[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm17, %ymm3, %ymm17 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm17 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm26[0],xmm21[0],xmm26[1],xmm21[1],xmm26[2],xmm21[2],xmm26[3],xmm21[3],xmm26[4],xmm21[4],xmm26[5],xmm21[5],xmm26[6],xmm21[6],xmm26[7],xmm21[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,4,5,5,7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm20[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm23, %ymm23 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm20, %ymm6, %ymm20 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm20 -; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm20 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm24[0],xmm19[0],xmm24[1],xmm19[1],xmm24[2],xmm19[2],xmm24[3],xmm19[3],xmm24[4],xmm19[4],xmm24[5],xmm19[5],xmm24[6],xmm19[6],xmm24[7],xmm19[7] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm23 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm17 = xmm17[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm23, %zmm17 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm23[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm28 = xmm23[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm28, %ymm27, %ymm27 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm27 = ymm27[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vpermw %ymm23, %ymm11, %ymm23 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm27, %zmm23, %zmm23 -; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k3} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm20[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm21, %ymm21 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpermw %ymm20, %ymm2, %ymm20 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm20, %zmm20 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm23[0],xmm19[0],xmm23[1],xmm19[1],xmm23[2],xmm19[2],xmm23[3],xmm19[3],xmm23[4],xmm19[4],xmm23[5],xmm19[5],xmm23[6],xmm19[6],xmm23[7],xmm19[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm21[0,1,2,3,4,5,5,7] +; 
AVX512BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm21[0,1,2,3,6,5,7,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpermw %ymm21, %ymm5, %ymm21 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm21, %zmm21 +; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm21 {%k1} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm24[0],xmm17[0],xmm24[1],xmm17[1],xmm24[2],xmm17[2],xmm24[3],xmm17[3],xmm24[4],xmm17[4],xmm24[5],xmm17[5],xmm24[6],xmm17[6],xmm24[7],xmm17[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm20[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 +; AVX512BW-NEXT: vmovdqa64 16(%rdi), %xmm26 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7] +; AVX512BW-NEXT: vpermw %ymm20, %ymm9, %ymm20 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm25 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm26[0],xmm22[0],xmm26[1],xmm22[1],xmm26[2],xmm22[2],xmm26[3],xmm22[3],xmm26[4],xmm22[4],xmm26[5],xmm22[5],xmm26[6],xmm22[6],xmm26[7],xmm22[7] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero,xmm20[2],zero,zero,zero,xmm20[3],zero,zero,zero,xmm20[4],zero,zero,zero,xmm20[5],zero,zero,zero,xmm20[6],zero,zero,zero,xmm20[7],zero,zero,zero +; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k3} ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm12[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm13, %ymm13 +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm12[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm13, %ymm13 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm12, %ymm3, %ymm12 +; AVX512BW-NEXT: vpermw %ymm12, %ymm2, %ymm12 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm26[8],xmm21[8],xmm26[9],xmm21[9],xmm26[10],xmm21[10],xmm26[11],xmm21[11],xmm26[12],xmm21[12],xmm26[13],xmm21[13],xmm26[14],xmm21[14],xmm26[15],xmm21[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm13[0,1,2,3,4,5,5,7] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm23[8],xmm19[8],xmm23[9],xmm19[9],xmm23[10],xmm19[10],xmm23[11],xmm19[11],xmm23[12],xmm19[12],xmm23[13],xmm19[13],xmm23[14],xmm19[14],xmm23[15],xmm19[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm13[0,1,2,3,4,5,5,7] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm13[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm13, %ymm6, %ymm13 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm13, %zmm13 +; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm19 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpermw %ymm13, %ymm5, %ymm13 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm13, %zmm13 ; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm24[8],xmm19[8],xmm24[9],xmm19[9],xmm24[10],xmm19[10],xmm24[11],xmm19[11],xmm24[12],xmm19[12],xmm24[13],xmm19[13],xmm24[14],xmm19[14],xmm24[15],xmm19[15] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm19 = 
xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm12, %zmm19, %zmm12 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm19[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vpermw %ymm19, %ymm11, %ymm19 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19 -; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k3} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm2 -; AVX512BW-NEXT: vpermw %ymm0, %ymm3, %ymm0 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm24[8],xmm17[8],xmm24[9],xmm17[9],xmm24[10],xmm17[10],xmm24[11],xmm17[11],xmm24[12],xmm17[12],xmm24[13],xmm17[13],xmm24[14],xmm17[14],xmm24[15],xmm17[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm12[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm12[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm19, %ymm17, %ymm17 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[2,1,3,3,6,5,7,7] +; AVX512BW-NEXT: vpermw %ymm12, %ymm9, %ymm12 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm12, %zmm12 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm26[8],xmm22[8],xmm26[9],xmm22[9],xmm26[10],xmm22[10],xmm26[11],xmm22[11],xmm26[12],xmm22[12],xmm26[13],xmm22[13],xmm26[14],xmm22[14],xmm26[15],xmm22[15] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero,xmm17[4],zero,zero,zero,xmm17[5],zero,zero,zero,xmm17[6],zero,zero,zero,xmm17[7],zero,zero,zero +; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm17 {%k3} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm1, %ymm1 +; AVX512BW-NEXT: vpermw %ymm0, %ymm2, %ymm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,5,7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,6,5,7,7] +; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpermw %ymm1, %ymm5, %ymm1 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, 
%zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpermw %ymm0, %ymm9, %ymm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512BW-NEXT: vpermw %ymm2, %ymm6, %ymm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512BW-NEXT: vpermw %ymm3, %ymm11, %ymm3 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k3} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm16, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 256(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm15, 448(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -8608,284 +8528,260 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa 16(%r10), %xmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 32(%r10), %xmm16 ; AVX512DQ-BW-NEXT: vmovdqa 48(%r10), %xmm15 -; AVX512DQ-BW-NEXT: vmovdqa (%rax), %xmm2 +; AVX512DQ-BW-NEXT: vmovdqa (%rax), %xmm1 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rax), %xmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 32(%rax), %xmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 48(%rax), %xmm18 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3] -; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm3, %ymm1 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3] +; AVX512DQ-BW-NEXT: vpermw %ymm3, %ymm2, %ymm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 48(%r9), %xmm19 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%r8), %xmm21 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7] +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%r8), %xmm20 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,5,5,7] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm6 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3] -; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm6, %ymm7 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3] +; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm5, %ymm7 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm14 ; AVX512DQ-BW-NEXT: movl $-2004318072, %eax # imm = 0x88888888 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%rcx), %xmm21 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdx), %xmm23 +; AVX512DQ-BW-NEXT: 
vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,1,3,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm9, %ymm7 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 48(%rsi), %xmm24 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdi), %xmm26 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%rcx), %xmm28 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdx), %xmm30 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm20[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm11, %ymm11 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm11[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm11, %ymm20 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero ; AVX512DQ-BW-NEXT: movl $572662306, %eax # imm = 0x22222222 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm1 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm22, %zmm7 {%k2} ; AVX512DQ-BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm7 {%k3} ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm14[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = 
ymm20[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm14, %ymm3, %ymm14 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm14, %zmm14 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,5,5,7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm22, %ymm22 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm14[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm22, %ymm22 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm22[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm6, %ymm20 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm14, %zmm23 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm20 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm14, %zmm20, %zmm14 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm22, %ymm25 +; AVX512DQ-BW-NEXT: vpermw %ymm14, %ymm2, %ymm14 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm22, %zmm14, %zmm14 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm22[0,1,2,3,4,5,5,7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm22[0,1,2,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm22, %ymm5, %ymm22 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm22, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm14, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm23[0],xmm21[0],xmm23[1],xmm21[1],xmm23[2],xmm21[2],xmm23[3],xmm21[3],xmm23[4],xmm21[4],xmm23[5],xmm21[5],xmm23[6],xmm21[6],xmm23[7],xmm21[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm14[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm26, %ymm22, %ymm22 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdi), %xmm27 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm22[2,1,3,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vpermw %ymm14, %ymm9, %ymm14 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm22, %zmm14, %zmm22 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm27[0],xmm24[0],xmm27[1],xmm24[1],xmm27[2],xmm24[2],xmm27[3],xmm24[3],xmm27[4],xmm24[4],xmm27[5],xmm24[5],xmm27[6],xmm24[6],xmm27[7],xmm24[7] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm14 
= xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero,xmm14[4],zero,zero,zero,xmm14[5],zero,zero,zero,xmm14[6],zero,zero,zero,xmm14[7],zero,zero,zero +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm22, %zmm14 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %xmm22 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm11, %ymm20 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %xmm25 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %xmm20 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm14 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm14 {%k3} ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm18 = xmm15[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm15[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm18, %ymm18 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %xmm23 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm15[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm18, %ymm18 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %xmm25 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm15, %ymm3, %ymm15 +; AVX512DQ-BW-NEXT: vpermw %ymm15, %ymm2, %ymm15 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm18, %zmm15, %zmm15 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,5,5,7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm19 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %xmm27 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm18[0,1,2,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm20, %ymm19, %ymm19 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %xmm20 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm18, %ymm6, %ymm18 +; AVX512DQ-BW-NEXT: vpermw %ymm18, %ymm5, %ymm18 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm26 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm15, %zmm18 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm19 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero,xmm15[2],zero,zero,zero,xmm15[3],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero,xmm15[2],zero,zero,zero,xmm15[3],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm19, %zmm15 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = 
xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm19[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm19[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vpermw %ymm19, %ymm11, %ymm19 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm19, %zmm19 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm23[8],xmm21[8],xmm23[9],xmm21[9],xmm23[10],xmm21[10],xmm23[11],xmm21[11],xmm23[12],xmm21[12],xmm23[13],xmm21[13],xmm23[14],xmm21[14],xmm23[15],xmm21[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm15[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm15[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm19 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %xmm21 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[2,1,3,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vpermw %ymm15, %ymm9, %ymm15 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm19 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm27[8],xmm24[8],xmm27[9],xmm24[9],xmm27[10],xmm24[10],xmm27[11],xmm24[11],xmm27[12],xmm24[12],xmm27[13],xmm24[13],xmm27[14],xmm24[14],xmm27[15],xmm24[15] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero,xmm15[2],zero,zero,zero,xmm15[3],zero,zero,zero,xmm15[4],zero,zero,zero,xmm15[5],zero,zero,zero,xmm15[6],zero,zero,zero,xmm15[7],zero,zero,zero ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm15 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm15 {%k3} ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm19 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm18[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm19, %ymm19 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm18, %ymm3, %ymm18 +; AVX512DQ-BW-NEXT: vpermw %ymm18, %ymm2, %ymm18 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm18 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm19[0,1,2,3,4,5,5,7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,5,5,7] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm19[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm19, %ymm6, %ymm19 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm19, %zmm24 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm18, %zmm24 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm23[0],xmm20[0],xmm23[1],xmm20[1],xmm23[2],xmm20[2],xmm23[3],xmm20[3],xmm23[4],xmm20[4],xmm23[5],xmm20[5],xmm23[6],xmm20[6],xmm23[7],xmm20[7] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm19 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero,xmm18[2],zero,zero,zero,xmm18[3],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm18 = 
xmm18[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero,xmm18[2],zero,zero,zero,xmm18[3],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm18, %zmm19, %zmm18 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm29[0],xmm27[0],xmm29[1],xmm27[1],xmm29[2],xmm27[2],xmm29[3],xmm27[3],xmm29[4],xmm27[4],xmm29[5],xmm27[5],xmm29[6],xmm27[6],xmm29[7],xmm27[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm19[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm19[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm26, %ymm21, %ymm26 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%r9), %xmm21 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm26 = ymm26[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vpermw %ymm19, %ymm11, %ymm19 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm26, %zmm19, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%r8), %xmm26 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm23, %ymm23 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm19, %ymm5, %ymm19 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm23, %zmm19, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm18, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm26[0],xmm20[0],xmm26[1],xmm20[1],xmm26[2],xmm20[2],xmm26[3],xmm20[3],xmm26[4],xmm20[4],xmm26[5],xmm20[5],xmm26[6],xmm20[6],xmm26[7],xmm20[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm18[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm19, %ymm19 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %xmm27 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[2,1,3,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vpermw %ymm18, %ymm9, %ymm18 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm19 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3],xmm27[4],xmm21[4],xmm27[5],xmm21[5],xmm27[6],xmm21[6],xmm27[7],xmm21[7] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero,xmm18[2],zero,zero,zero,xmm18[3],zero,zero,zero,xmm18[4],zero,zero,zero,xmm18[5],zero,zero,zero,xmm18[6],zero,zero,zero,xmm18[7],zero,zero,zero ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rsi), %xmm19 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm18 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa64 16(%r9), %xmm19 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k3} ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm16[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm16[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm17, %ymm17 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdi), %xmm24 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm16[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm17, %ymm17 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%r8), %xmm23 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm16, %ymm3, %ymm16 +; AVX512DQ-BW-NEXT: vpermw %ymm16, %ymm2, %ymm16 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15] -; 
AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm17[0,1,2,3,4,5,5,7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm17[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm22, %ymm25 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rcx), %xmm22 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm17, %ymm6, %ymm17 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm17, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdx), %xmm25 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm23[8],xmm20[8],xmm23[9],xmm20[9],xmm23[10],xmm20[10],xmm23[11],xmm20[11],xmm23[12],xmm20[12],xmm23[13],xmm20[13],xmm23[14],xmm20[14],xmm23[15],xmm20[15] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm20 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm16 = xmm16[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm16, %zmm20, %zmm16 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm27[8],xmm29[9],xmm27[9],xmm29[10],xmm27[10],xmm29[11],xmm27[11],xmm29[12],xmm27[12],xmm29[13],xmm27[13],xmm29[14],xmm27[14],xmm29[15],xmm27[15] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm20[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm23, %ymm23 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm11, %ymm20 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm20 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm22[0,1,2,3,4,5,5,7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm22[0,1,2,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm17, %ymm24 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rcx), %xmm17 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm22, %ymm5, %ymm22 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm24, %zmm22, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdx), %xmm24 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm26[8],xmm20[8],xmm26[9],xmm20[9],xmm26[10],xmm20[10],xmm26[11],xmm20[11],xmm26[12],xmm20[12],xmm26[13],xmm20[13],xmm26[14],xmm20[14],xmm26[15],xmm20[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm16[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm16[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rsi), %xmm22 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vpermw %ymm16, %ymm9, %ymm16 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm16, %zmm20 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm27[8],xmm21[8],xmm27[9],xmm21[9],xmm27[10],xmm21[10],xmm27[11],xmm21[11],xmm27[12],xmm21[12],xmm27[13],xmm21[13],xmm27[14],xmm21[14],xmm27[15],xmm21[15] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero,xmm16[4],zero,zero,zero,xmm16[5],zero,zero,zero,xmm16[6],zero,zero,zero,xmm16[7],zero,zero,zero ; 
AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm16 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm17, %zmm16 {%k3} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm17[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm17[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm17, %ymm3, %ymm17 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm17 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm26[0],xmm21[0],xmm26[1],xmm21[1],xmm26[2],xmm21[2],xmm26[3],xmm21[3],xmm26[4],xmm21[4],xmm26[5],xmm21[5],xmm26[6],xmm21[6],xmm26[7],xmm21[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,4,5,5,7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm20[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm23, %ymm23 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm6, %ymm20 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm24[0],xmm19[0],xmm24[1],xmm19[1],xmm24[2],xmm19[2],xmm24[3],xmm19[3],xmm24[4],xmm19[4],xmm24[5],xmm19[5],xmm24[6],xmm19[6],xmm24[7],xmm19[7] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm23 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm17 = xmm17[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm23, %zmm17 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm23[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm28 = xmm23[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm28, %ymm27, %ymm27 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm27 = ymm27[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vpermw %ymm23, %ymm11, %ymm23 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm27, %zmm23, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm23, %zmm17 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k3} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm20[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm21, %ymm21 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm2, %ymm20 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm20, %zmm20 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm23[0],xmm19[0],xmm23[1],xmm19[1],xmm23[2],xmm19[2],xmm23[3],xmm19[3],xmm23[4],xmm19[4],xmm23[5],xmm19[5],xmm23[6],xmm19[6],xmm23[7],xmm19[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm21[0,1,2,3,4,5,5,7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm21[0,1,2,3,6,5,7,7] +; 
AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm21, %ymm5, %ymm21 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm21, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm24[0],xmm17[0],xmm24[1],xmm17[1],xmm24[2],xmm17[2],xmm24[3],xmm17[3],xmm24[4],xmm17[4],xmm24[5],xmm17[5],xmm24[6],xmm17[6],xmm24[7],xmm17[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm20[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdi), %xmm26 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm9, %ymm20 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm25 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm26[0],xmm22[0],xmm26[1],xmm22[1],xmm26[2],xmm22[2],xmm26[3],xmm22[3],xmm26[4],xmm22[4],xmm26[5],xmm22[5],xmm26[6],xmm22[6],xmm26[7],xmm22[7] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero,xmm20[2],zero,zero,zero,xmm20[3],zero,zero,zero,xmm20[4],zero,zero,zero,xmm20[5],zero,zero,zero,xmm20[6],zero,zero,zero,xmm20[7],zero,zero,zero +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k3} ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm12[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm20, %ymm13, %ymm13 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm12[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm13, %ymm13 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm12, %ymm3, %ymm12 +; AVX512DQ-BW-NEXT: vpermw %ymm12, %ymm2, %ymm12 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm26[8],xmm21[8],xmm26[9],xmm21[9],xmm26[10],xmm21[10],xmm26[11],xmm21[11],xmm26[12],xmm21[12],xmm26[13],xmm21[13],xmm26[14],xmm21[14],xmm26[15],xmm21[15] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm13[0,1,2,3,4,5,5,7] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm23[8],xmm19[8],xmm23[9],xmm19[9],xmm23[10],xmm19[10],xmm23[11],xmm19[11],xmm23[12],xmm19[12],xmm23[13],xmm19[13],xmm23[14],xmm19[14],xmm23[15],xmm19[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm13[0,1,2,3,4,5,5,7] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm13[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm13, %ymm6, %ymm13 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm13, %zmm13 +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm19 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm13, %ymm5, %ymm13 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm13, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = 
xmm24[8],xmm19[8],xmm24[9],xmm19[9],xmm24[10],xmm19[10],xmm24[11],xmm19[11],xmm24[12],xmm19[12],xmm24[13],xmm19[13],xmm24[14],xmm19[14],xmm24[15],xmm19[15] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm19 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm12, %zmm19, %zmm12 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm19[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vpermw %ymm19, %ymm11, %ymm19 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm12 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k3} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm3, %ymm0 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm24[8],xmm17[8],xmm24[9],xmm17[9],xmm24[10],xmm17[10],xmm24[11],xmm17[11],xmm24[12],xmm17[12],xmm24[13],xmm17[13],xmm24[14],xmm17[14],xmm24[15],xmm17[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm12[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm12[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm19, %ymm17, %ymm17 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[2,1,3,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vpermw %ymm12, %ymm9, %ymm12 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm12, %zmm12 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm26[8],xmm22[8],xmm26[9],xmm22[9],xmm26[10],xmm22[10],xmm26[11],xmm22[11],xmm26[12],xmm22[12],xmm26[13],xmm22[13],xmm26[14],xmm22[14],xmm26[15],xmm22[15] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero,xmm17[4],zero,zero,zero,xmm17[5],zero,zero,zero,xmm17[6],zero,zero,zero,xmm17[7],zero,zero,zero +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm17 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm17 {%k3} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm2, %ymm0 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-BW-NEXT: 
vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,5,7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm5, %ymm1 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm9, %ymm0 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512DQ-BW-NEXT: vpermw %ymm2, %ymm6, %ymm2 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512DQ-BW-NEXT: vpermw %ymm3, %ymm11, %ymm3 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k3} ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 
192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 320(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 256(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 448(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; From 13de15c9c49068db850368c45ffed8f7bbf07f20 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 18 Feb 2025 00:15:01 -0800 Subject: [PATCH 088/127] [clang-format] Fix a bug in annotating StartOfName (#127545) Also ensure we can break before ClassHeadName like StartOfName. Fixes #127470 --- clang/lib/Format/TokenAnnotator.cpp | 6 +++--- clang/unittests/Format/FormatTest.cpp | 5 +++++ clang/unittests/Format/TokenAnnotatorTest.cpp | 4 ++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 069fd40e2834c..e68daa422b7c4 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -2596,7 +2596,7 @@ class AnnotatingParser { (!NextNonComment && !Line.InMacroBody) || (NextNonComment && (NextNonComment->isPointerOrReference() || - NextNonComment->is(tok::string_literal) || + NextNonComment->isOneOf(TT_ClassHeadName, tok::string_literal) || (Line.InPragmaDirective && NextNonComment->is(tok::identifier))))) { return false; } @@ -6198,8 +6198,8 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, FormatStyle::PAS_Right && (!Right.Next || Right.Next->isNot(TT_FunctionDeclarationName))); } - if (Right.isOneOf(TT_StartOfName, TT_FunctionDeclarationName) || - Right.is(tok::kw_operator)) { + if (Right.isOneOf(TT_StartOfName, TT_FunctionDeclarationName, + TT_ClassHeadName, tok::kw_operator)) { return true; } if (Left.is(TT_PointerOrReference)) diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 2365a7c40bf76..d6d028436d39c 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -29028,6 +29028,11 @@ TEST_F(FormatTest, WrapNamespaceBodyWithEmptyLinesAlways) { Style); } +TEST_F(FormatTest, BreakBeforeClassName) { + verifyFormat("class ABSL_ATTRIBUTE_TRIVIAL_ABI ABSL_NULLABILITY_COMPATIBLE\n" + " ArenaSafeUniquePtr {};"); +} + } // namespace } // namespace test } // namespace format diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 1d0870c818acc..8ada6c3daeaf6 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -3250,6 +3250,10 @@ TEST_F(TokenAnnotatorTest, StartOfName) { EXPECT_TOKEN(Tokens[0], tok::at, TT_ObjCDecl); EXPECT_TOKEN(Tokens[2], tok::identifier, TT_StartOfName); + Tokens = annotate("class FOO BAR C {};"); + ASSERT_EQ(Tokens.size(), 8u) << Tokens; + EXPECT_TOKEN(Tokens[2], tok::identifier, TT_Unknown); // Not StartOfName + auto Style = getLLVMStyle(); Style.StatementAttributeLikeMacros.push_back("emit"); Tokens = annotate("emit foo = 0;", Style); From e235fcb582eec5f58c905b66f96d0732d17b875e Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 18 Feb 2025 09:17:51 +0100 Subject: [PATCH 089/127] [BOLT] Only link and initialize supported targets (#127509) Bolt currently links and initializes all LLVM targets. This substantially increases the binary size, and link time if LTO is used. Instead, only link the targets specified by BOLT_TARGETS_TO_BUILD. 
We also have to only initialize those targets, so generate a TargetConfig.def file with the necessary information. The way the initialization is done mirrors what llvm-exegesis does. This reduces llvm-bolt size from 137MB to 78MB for me. --- bolt/CMakeLists.txt | 8 +++++++ bolt/include/bolt/Core/TargetConfig.def.in | 23 +++++++++++++++++++ bolt/tools/binary-analysis/CMakeLists.txt | 2 +- .../tools/binary-analysis/binary-analysis.cpp | 16 +++++++------ bolt/tools/driver/CMakeLists.txt | 2 +- bolt/tools/driver/llvm-bolt.cpp | 16 +++++++------ bolt/tools/heatmap/CMakeLists.txt | 2 +- bolt/tools/heatmap/heatmap.cpp | 16 +++++++------ bolt/tools/llvm-bolt-fuzzer/CMakeLists.txt | 2 +- .../llvm-bolt-fuzzer/llvm-bolt-fuzzer.cpp | 15 +++++++----- bolt/unittests/Core/BinaryContext.cpp | 15 +++++++----- bolt/unittests/Core/CMakeLists.txt | 2 +- bolt/unittests/Core/MCPlusBuilder.cpp | 15 +++++++----- bolt/unittests/Core/MemoryMaps.cpp | 15 +++++++----- 14 files changed, 99 insertions(+), 50 deletions(-) create mode 100644 bolt/include/bolt/Core/TargetConfig.def.in diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 04db160b64b05..f5ffa81227064 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -202,3 +202,11 @@ endif() configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/bolt/RuntimeLibs/RuntimeLibraryVariables.inc.in ${CMAKE_CURRENT_BINARY_DIR}/include/bolt/RuntimeLibs/RuntimeLibraryVariables.inc @ONLY) + +set(BOLT_ENUM_TARGETS "") +foreach(t ${BOLT_TARGETS_TO_BUILD}) + set(BOLT_ENUM_TARGETS "${BOLT_ENUM_TARGETS}BOLT_TARGET(${t})\n") +endforeach(t) + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/bolt/Core/TargetConfig.def.in + ${CMAKE_CURRENT_BINARY_DIR}/include/bolt/Core/TargetConfig.def @ONLY) diff --git a/bolt/include/bolt/Core/TargetConfig.def.in b/bolt/include/bolt/Core/TargetConfig.def.in new file mode 100644 index 0000000000000..a52ebd92b56fd --- /dev/null +++ b/bolt/include/bolt/Core/TargetConfig.def.in @@ -0,0 +1,23 @@ +//===-- TargetConfig.def.in - Information about available targets ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is configured by the build system to define the available bolt +// targets. +// +// The variant of this file not ending with .in has been autogenerated by the +// LLVM build. Do not edit! +// +//===----------------------------------------------------------------------===// + +#ifndef BOLT_TARGET +# error Please define the macro BOLT_TARGET(TargetName) +#endif + +@BOLT_ENUM_TARGETS@ + +#undef BOLT_TARGET diff --git a/bolt/tools/binary-analysis/CMakeLists.txt b/bolt/tools/binary-analysis/CMakeLists.txt index 841fc5b371185..29f224e0f66ff 100644 --- a/bolt/tools/binary-analysis/CMakeLists.txt +++ b/bolt/tools/binary-analysis/CMakeLists.txt @@ -1,5 +1,5 @@ set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} + ${BOLT_TARGETS_TO_BUILD} MC Object Support diff --git a/bolt/tools/binary-analysis/binary-analysis.cpp b/bolt/tools/binary-analysis/binary-analysis.cpp index b03fee3e025ae..0e3584eeedd18 100644 --- a/bolt/tools/binary-analysis/binary-analysis.cpp +++ b/bolt/tools/binary-analysis/binary-analysis.cpp @@ -88,13 +88,15 @@ int main(int argc, char **argv) { llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. // Initialize targets and assembly printers/parsers. 
- llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllDisassemblers(); - - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmPrinters(); +#define BOLT_TARGET(target) \ + LLVMInitialize##target##TargetInfo(); \ + LLVMInitialize##target##TargetMC(); \ + LLVMInitialize##target##AsmParser(); \ + LLVMInitialize##target##Disassembler(); \ + LLVMInitialize##target##Target(); \ + LLVMInitialize##target##AsmPrinter(); + +#include "bolt/Core/TargetConfig.def" ParseCommandLine(argc, argv); diff --git a/bolt/tools/driver/CMakeLists.txt b/bolt/tools/driver/CMakeLists.txt index 9bf9ff85edc7b..4b3c7416de974 100644 --- a/bolt/tools/driver/CMakeLists.txt +++ b/bolt/tools/driver/CMakeLists.txt @@ -1,5 +1,5 @@ set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} + ${BOLT_TARGETS_TO_BUILD} MC Object Support diff --git a/bolt/tools/driver/llvm-bolt.cpp b/bolt/tools/driver/llvm-bolt.cpp index f151cf5f63fc5..6b6714723fa3b 100644 --- a/bolt/tools/driver/llvm-bolt.cpp +++ b/bolt/tools/driver/llvm-bolt.cpp @@ -183,13 +183,15 @@ int main(int argc, char **argv) { std::string ToolPath = llvm::sys::fs::getMainExecutable(argv[0], nullptr); // Initialize targets and assembly printers/parsers. - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllDisassemblers(); - - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmPrinters(); +#define BOLT_TARGET(target) \ + LLVMInitialize##target##TargetInfo(); \ + LLVMInitialize##target##TargetMC(); \ + LLVMInitialize##target##AsmParser(); \ + LLVMInitialize##target##Disassembler(); \ + LLVMInitialize##target##Target(); \ + LLVMInitialize##target##AsmPrinter(); + +#include "bolt/Core/TargetConfig.def" ToolName = argv[0]; diff --git a/bolt/tools/heatmap/CMakeLists.txt b/bolt/tools/heatmap/CMakeLists.txt index acddc7a50e8b1..c5d3f67413929 100644 --- a/bolt/tools/heatmap/CMakeLists.txt +++ b/bolt/tools/heatmap/CMakeLists.txt @@ -1,5 +1,5 @@ set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} + ${BOLT_TARGETS_TO_BUILD} MC Object Support diff --git a/bolt/tools/heatmap/heatmap.cpp b/bolt/tools/heatmap/heatmap.cpp index 3bb9f2ce7491d..6add36cc6715f 100644 --- a/bolt/tools/heatmap/heatmap.cpp +++ b/bolt/tools/heatmap/heatmap.cpp @@ -76,13 +76,15 @@ int main(int argc, char **argv) { opts::OutputFilename = "-"; // Initialize targets and assembly printers/parsers. 
- llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllDisassemblers(); - - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmPrinters(); +#define BOLT_TARGET(target) \ + LLVMInitialize##target##TargetInfo(); \ + LLVMInitialize##target##TargetMC(); \ + LLVMInitialize##target##AsmParser(); \ + LLVMInitialize##target##Disassembler(); \ + LLVMInitialize##target##Target(); \ + LLVMInitialize##target##AsmPrinter(); + +#include "bolt/Core/TargetConfig.def" ToolName = argv[0]; std::string ToolPath = GetExecutablePath(argv[0]); diff --git a/bolt/tools/llvm-bolt-fuzzer/CMakeLists.txt b/bolt/tools/llvm-bolt-fuzzer/CMakeLists.txt index f21285f634bad..7eaacb74a9da6 100644 --- a/bolt/tools/llvm-bolt-fuzzer/CMakeLists.txt +++ b/bolt/tools/llvm-bolt-fuzzer/CMakeLists.txt @@ -1,5 +1,5 @@ set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} + ${BOLT_TARGETS_TO_BUILD} ) add_llvm_fuzzer(llvm-bolt-fuzzer diff --git a/bolt/tools/llvm-bolt-fuzzer/llvm-bolt-fuzzer.cpp b/bolt/tools/llvm-bolt-fuzzer/llvm-bolt-fuzzer.cpp index bdb5768a91da1..09049788aebec 100644 --- a/bolt/tools/llvm-bolt-fuzzer/llvm-bolt-fuzzer.cpp +++ b/bolt/tools/llvm-bolt-fuzzer/llvm-bolt-fuzzer.cpp @@ -58,13 +58,16 @@ extern "C" int LLVMFuzzerTestOneInput(const char *Data, size_t Size) { extern "C" LLVM_ATTRIBUTE_USED int LLVMFuzzerInitialize(int *argc, char ***argv) { - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllDisassemblers(); + // Initialize targets and assembly printers/parsers. +#define BOLT_TARGET(target) \ + LLVMInitialize##target##TargetInfo(); \ + LLVMInitialize##target##TargetMC(); \ + LLVMInitialize##target##AsmParser(); \ + LLVMInitialize##target##Disassembler(); \ + LLVMInitialize##target##Target(); \ + LLVMInitialize##target##AsmPrinter(); - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmPrinters(); +#include "bolt/Core/TargetConfig.def" return 0; } diff --git a/bolt/unittests/Core/BinaryContext.cpp b/bolt/unittests/Core/BinaryContext.cpp index 0fefa1b83c3c2..09d16966334da 100644 --- a/bolt/unittests/Core/BinaryContext.cpp +++ b/bolt/unittests/Core/BinaryContext.cpp @@ -27,12 +27,15 @@ struct BinaryContextTester : public testing::TestWithParam { protected: void initalizeLLVM() { - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllDisassemblers(); - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmPrinters(); +#define BOLT_TARGET(target) \ + LLVMInitialize##target##TargetInfo(); \ + LLVMInitialize##target##TargetMC(); \ + LLVMInitialize##target##AsmParser(); \ + LLVMInitialize##target##Disassembler(); \ + LLVMInitialize##target##Target(); \ + LLVMInitialize##target##AsmPrinter(); + +#include "bolt/Core/TargetConfig.def" } void prepareElf() { diff --git a/bolt/unittests/Core/CMakeLists.txt b/bolt/unittests/Core/CMakeLists.txt index 208cf6ced7358..8ac88b701ea05 100644 --- a/bolt/unittests/Core/CMakeLists.txt +++ b/bolt/unittests/Core/CMakeLists.txt @@ -2,7 +2,7 @@ set(LLVM_LINK_COMPONENTS DebugInfoDWARF Object MC - ${LLVM_TARGETS_TO_BUILD} + ${BOLT_TARGETS_TO_BUILD} ) add_bolt_unittest(CoreTests diff --git a/bolt/unittests/Core/MCPlusBuilder.cpp b/bolt/unittests/Core/MCPlusBuilder.cpp index 5488cae366284..d367eb07f7767 100644 --- a/bolt/unittests/Core/MCPlusBuilder.cpp +++ b/bolt/unittests/Core/MCPlusBuilder.cpp @@ -37,12 +37,15 @@ struct MCPlusBuilderTester : public testing::TestWithParam { 
protected: void initalizeLLVM() { - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllDisassemblers(); - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmPrinters(); +#define BOLT_TARGET(target) \ + LLVMInitialize##target##TargetInfo(); \ + LLVMInitialize##target##TargetMC(); \ + LLVMInitialize##target##AsmParser(); \ + LLVMInitialize##target##Disassembler(); \ + LLVMInitialize##target##Target(); \ + LLVMInitialize##target##AsmPrinter(); + +#include "bolt/Core/TargetConfig.def" } void prepareElf() { diff --git a/bolt/unittests/Core/MemoryMaps.cpp b/bolt/unittests/Core/MemoryMaps.cpp index 06073d0a82e14..2e1bc4d280aed 100644 --- a/bolt/unittests/Core/MemoryMaps.cpp +++ b/bolt/unittests/Core/MemoryMaps.cpp @@ -38,12 +38,15 @@ struct MemoryMapsTester : public testing::TestWithParam { protected: void initalizeLLVM() { - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllDisassemblers(); - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmPrinters(); +#define BOLT_TARGET(target) \ + LLVMInitialize##target##TargetInfo(); \ + LLVMInitialize##target##TargetMC(); \ + LLVMInitialize##target##AsmParser(); \ + LLVMInitialize##target##Disassembler(); \ + LLVMInitialize##target##Target(); \ + LLVMInitialize##target##AsmPrinter(); + +#include "bolt/Core/TargetConfig.def" } void prepareElf() { From 03cb46d248b08fa7ca740d78f0847adcc3e76ad8 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 18 Feb 2025 09:29:25 +0100 Subject: [PATCH 090/127] [CodeGen] Use getSignedConstant() in more places (#127501) Use getSignedConstant() in a few more places, based on a search of `\bgetConstant(-`. Most of these were fine as-is (e.g. because they work on 64-bits), but I think it's better to use getSignedConstant() consistently for negative numbers. 
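For illustration only, here is a minimal standalone C++ sketch (plain ISO C++, deliberately not the SelectionDAG API; the constant -2048 and the variable names are arbitrary) of the two's-complement round-trip that made most of the existing getConstant(-X, ...) call sites come out right anyway: a negative value pushed through an unsigned 64-bit parameter and later truncated to the target width still reads back as the intended negative number, so the explicitly signed entry point mostly documents intent rather than changing behaviour.

    #include <cstdint>
    #include <cstdio>

    // Standalone sketch (not LLVM code): a negative constant survives the
    // trip through an unsigned 64-bit carrier and a truncation to 32 bits,
    // as long as it is reinterpreted as signed at that narrower width.
    int main() {
      int64_t Wanted = -2048;
      uint64_t AsUnsigned = static_cast<uint64_t>(Wanted);  // 0xFFFFFFFFFFFFF800
      uint32_t Bits32 = static_cast<uint32_t>(AsUnsigned);  // 0xFFFFF800
      std::printf("64-bit payload: 0x%016llx\n",
                  static_cast<unsigned long long>(AsUnsigned));
      std::printf("reinterpreted at 32 bits: %d\n",
                  static_cast<int32_t>(Bits32));  // prints -2048 again
      return 0;
    }

Compiled with any C++17 compiler (e.g. `g++ sketch.cpp && ./a.out`, assuming the file name), this prints the sign-extended bit pattern and then -2048, which is the behaviour the call sites below were implicitly relying on.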
--- .../Target/AArch64/AArch64ISelLowering.cpp | 20 ++++++++++--------- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 6 +++--- .../Target/Hexagon/HexagonISelDAGToDAG.cpp | 3 ++- .../Target/Hexagon/HexagonISelLowering.cpp | 2 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 18 ++++++++--------- llvm/lib/Target/VE/VEISelLowering.cpp | 12 ++++++----- 7 files changed, 34 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8f849af6f4d35..d47a0bfa4fc50 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11780,8 +11780,9 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (Align && *Align > MinSlotSize) { VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Align->value() - 1, DL, PtrVT)); - VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, - DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT)); + VAList = + DAG.getNode(ISD::AND, DL, PtrVT, VAList, + DAG.getSignedConstant(-(int64_t)Align->value(), DL, PtrVT)); } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); @@ -16147,8 +16148,9 @@ AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, Chain = SP.getValue(1); SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) - SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); + SP = + DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), + DAG.getSignedConstant(-(uint64_t)Align->value(), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); SDValue Ops[2] = {SP, Chain}; return DAG.getMergeValues(Ops, dl); @@ -16185,7 +16187,7 @@ AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); + DAG.getSignedConstant(-(uint64_t)Align->value(), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); @@ -16213,7 +16215,7 @@ AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); + DAG.getSignedConstant(-(uint64_t)Align->value(), dl, VT)); // Set the real SP to the new value with a probing loop. 
Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP); @@ -21485,7 +21487,7 @@ static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { Op = DAG.getNode(Opcode, dl, VT, Op, - DAG.getConstant(-ShiftAmount, dl, MVT::i32)); + DAG.getSignedConstant(-ShiftAmount, dl, MVT::i32)); if (N->getValueType(0) == MVT::i64) Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op, DAG.getConstant(0, dl, MVT::i64)); @@ -27364,10 +27366,10 @@ static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SDLoc dl(Val128); Val2x64.first = DAG.getNode(ISD::XOR, dl, MVT::i64, - DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first); + DAG.getAllOnesConstant(dl, MVT::i64), Val2x64.first); Val2x64.second = DAG.getNode(ISD::XOR, dl, MVT::i64, - DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second); + DAG.getAllOnesConstant(dl, MVT::i64), Val2x64.second); } SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain}; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index bb78e77a9dc1a..4fd68b52b53bb 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -913,7 +913,7 @@ class VGPRImm : PatLeafgetConstant(-N->getSExtValue(), SDLoc(N), MVT::i32); + return CurDAG->getSignedConstant(-N->getSExtValue(), SDLoc(N), MVT::i32); }]>; // TODO: When FP inline imm values work? diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 2bac1d0086041..eb1491feb611e 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -20786,9 +20786,9 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const Chain = SP.getValue(1); SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size); if (Align) - SP = - DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32)); + SP = DAG.getNode( + ISD::AND, DL, MVT::i32, SP.getValue(0), + DAG.getSignedConstant(-(uint64_t)Align->value(), DL, MVT::i32)); Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP); SDValue Ops[2] = { SP, Chain }; return DAG.getMergeValues(Ops, DL); diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 10db4f552cdcf..c0baf301e0624 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -617,7 +617,8 @@ void HexagonDAGToDAGISel::SelectSHL(SDNode *N) { if (ConstantSDNode *C2 = dyn_cast(Shl2_1)) { int32_t ValConst = 1 << (ShlConst + C2->getSExtValue()); if (isInt<9>(-ValConst)) { - SDValue Val = CurDAG->getTargetConstant(-ValConst, dl, MVT::i32); + SDValue Val = + CurDAG->getSignedTargetConstant(-ValConst, dl, MVT::i32); SDNode *Result = CurDAG->getMachineNode(Hexagon::M2_mpysmi, dl, MVT::i32, Shl2_0, Val); ReplaceNode(N, Result); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index b31360b4096da..d66e3e306d2ff 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -3491,7 +3491,7 @@ HexagonTargetLowering::PerformDAGCombine(SDNode *N, SDValue P = Op.getOperand(0); switch (P.getOpcode()) { case HexagonISD::PTRUE: - return DCI.DAG.getConstant(-1, dl, ty(Op)); + return DCI.DAG.getAllOnesConstant(dl, ty(Op)); case HexagonISD::PFALSE: return getZero(dl, ty(Op), DCI.DAG); default: diff --git 
a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 767d1ded8de3a..4720928f472b3 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -8883,8 +8883,8 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, Round = DAG.getNode(ISD::ADD, dl, MVT::i64, Round, DAG.getConstant(2047, dl, MVT::i64)); Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT); - Round = DAG.getNode(ISD::AND, dl, MVT::i64, - Round, DAG.getConstant(-2048, dl, MVT::i64)); + Round = DAG.getNode(ISD::AND, dl, MVT::i64, Round, + DAG.getSignedConstant(-2048, dl, MVT::i64)); // However, we cannot use that value unconditionally: if the magnitude // of the input value is small, the bit-twiddling we did above might @@ -9244,7 +9244,7 @@ SDValue PPCTargetLowering::LowerGET_ROUNDING(SDValue Op, SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - unsigned BitWidth = VT.getSizeInBits(); + uint64_t BitWidth = VT.getSizeInBits(); SDLoc dl(Op); assert(Op.getNumOperands() == 3 && VT == Op.getOperand(1).getValueType() && @@ -9263,7 +9263,7 @@ SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const { SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1); SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3); SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, - DAG.getConstant(-BitWidth, dl, AmtVT)); + DAG.getSignedConstant(-BitWidth, dl, AmtVT)); SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5); SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt); @@ -9274,7 +9274,7 @@ SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const { SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc dl(Op); - unsigned BitWidth = VT.getSizeInBits(); + uint64_t BitWidth = VT.getSizeInBits(); assert(Op.getNumOperands() == 3 && VT == Op.getOperand(1).getValueType() && "Unexpected SRL!"); @@ -9292,7 +9292,7 @@ SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const { SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, - DAG.getConstant(-BitWidth, dl, AmtVT)); + DAG.getSignedConstant(-BitWidth, dl, AmtVT)); SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5); SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt); @@ -9303,7 +9303,7 @@ SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const { SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); EVT VT = Op.getValueType(); - unsigned BitWidth = VT.getSizeInBits(); + uint64_t BitWidth = VT.getSizeInBits(); assert(Op.getNumOperands() == 3 && VT == Op.getOperand(1).getValueType() && "Unexpected SRA!"); @@ -9320,7 +9320,7 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, - DAG.getConstant(-BitWidth, dl, AmtVT)); + DAG.getSignedConstant(-BitWidth, dl, AmtVT)); SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5); SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt); SDValue OutLo = 
DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, dl, AmtVT), @@ -18308,7 +18308,7 @@ static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG, SDValue AddOrZ = NegConstant != 0 ? Add : Z; SDValue Addc = DAG.getNode(ISD::UADDO_CARRY, DL, DAG.getVTList(MVT::i64, CarryType), - AddOrZ, DAG.getConstant(-1ULL, DL, MVT::i64), + AddOrZ, DAG.getAllOnesConstant(DL, MVT::i64), DAG.getConstant(0, DL, CarryType)); return DAG.getNode(ISD::UADDO_CARRY, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64), diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp index aff058868f306..62064579b4bdf 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -1216,8 +1216,9 @@ SDValue VETargetLowering::lowerATOMIC_SWAP(SDValue Op, SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits); SDValue Ptr = N->getOperand(1); - SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), - {Ptr, DAG.getConstant(-4, DL, MVT::i64)}); + SDValue Aligned = + DAG.getNode(ISD::AND, DL, Ptr.getValueType(), + {Ptr, DAG.getSignedConstant(-4, DL, MVT::i64)}); SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(), DAG.getVTList(Op.getNode()->getValueType(0), Op.getNode()->getValueType(1)), @@ -1235,8 +1236,9 @@ SDValue VETargetLowering::lowerATOMIC_SWAP(SDValue Op, SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits); SDValue Ptr = N->getOperand(1); - SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), - {Ptr, DAG.getConstant(-4, DL, MVT::i64)}); + SDValue Aligned = + DAG.getNode(ISD::AND, DL, Ptr.getValueType(), + {Ptr, DAG.getSignedConstant(-4, DL, MVT::i64)}); SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(), DAG.getVTList(Op.getNode()->getValueType(0), Op.getNode()->getValueType(1)), @@ -1601,7 +1603,7 @@ SDValue VETargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const { VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Align - 1, DL, PtrVT)); VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, - DAG.getConstant(-Align, DL, PtrVT)); + DAG.getSignedConstant(-Align, DL, PtrVT)); // Increment the pointer, VAList, by 16 to the next vaarg. NextPtr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(16, DL)); From 6a360b313d27e46988d573a663e9127622eb205c Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 18 Feb 2025 15:37:20 +0700 Subject: [PATCH 091/127] AMDGPU: Remove redundant inline constant check (#127582) We don't really care of this is an inline constant, only if it will be legal. 
--- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 999553bfaff38..7c08a21dea3b8 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -824,8 +824,7 @@ bool SIFoldOperandsImpl::tryToFoldACImm( return false; uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType; - if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) && - TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) { + if (OpToFold.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) { UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm()); return true; } @@ -845,8 +844,7 @@ bool SIFoldOperandsImpl::tryToFoldACImm( MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) { MachineOperand &DefOp = Def->getOperand(1); - if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) && - TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) { + if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) { UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm()); return true; } From f7c71f162269a10a635c4125142ae8b0a194f3aa Mon Sep 17 00:00:00 2001 From: Jason Rice Date: Tue, 18 Feb 2025 00:42:24 -0800 Subject: [PATCH 092/127] [Clang][P1061] Consolidate ResolvedUnpexandedPackExpr into FunctionParmPackExpr (#125394) This merges the functionality of ResolvedUnexpandedPackExpr into FunctionParmPackExpr. I also added a test to show that https://github.com/llvm/llvm-project/issues/125103 should be fixed with this. I put the removal of ResolvedUnexpandedPackExpr in its own commit. Let me know what you think. Fixes #125103 --- clang/include/clang/AST/DeclCXX.h | 28 +++---- clang/include/clang/AST/ExprCXX.h | 78 +++---------------- clang/include/clang/AST/RecursiveASTVisitor.h | 1 - clang/include/clang/Basic/StmtNodes.td | 1 - clang/include/clang/Sema/Sema.h | 3 +- clang/include/clang/Sema/Template.h | 2 +- .../include/clang/Serialization/ASTBitCodes.h | 1 - clang/lib/AST/DeclCXX.cpp | 9 ++- clang/lib/AST/Expr.cpp | 1 - clang/lib/AST/ExprCXX.cpp | 63 ++------------- clang/lib/AST/ExprClassification.cpp | 7 -- clang/lib/AST/ExprConstant.cpp | 1 - clang/lib/AST/ItaniumMangle.cpp | 3 +- clang/lib/AST/StmtPrinter.cpp | 9 --- clang/lib/AST/StmtProfile.cpp | 4 - clang/lib/Sema/SemaDeclCXX.cpp | 26 +++---- clang/lib/Sema/SemaExceptionSpec.cpp | 1 - clang/lib/Sema/SemaExpr.cpp | 6 +- clang/lib/Sema/SemaTemplateInstantiate.cpp | 47 +++-------- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 43 +++++----- clang/lib/Sema/SemaTemplateVariadic.cpp | 48 +++--------- clang/lib/Sema/TreeTransform.h | 25 ------ clang/lib/Serialization/ASTReaderStmt.cpp | 22 +----- clang/lib/Serialization/ASTWriter.cpp | 1 - clang/lib/Serialization/ASTWriterStmt.cpp | 10 --- clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 1 - clang/test/AST/ast-dump-binding-pack.cpp | 13 +--- clang/test/SemaCXX/cxx2c-binding-pack.cpp | 30 +++++++ clang/tools/libclang/CXCursor.cpp | 1 - 29 files changed, 130 insertions(+), 355 deletions(-) diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h index 766821b4fb25c..266b93a64a390 100644 --- a/clang/include/clang/AST/DeclCXX.h +++ b/clang/include/clang/AST/DeclCXX.h @@ -4194,8 +4194,8 @@ class BindingDecl : public ValueDecl { /// decomposition declaration, and when the initializer is type-dependent. 
Expr *getBinding() const { return Binding; } - // Get the array of Exprs when the binding represents a pack. - llvm::ArrayRef getBindingPackExprs() const; + // Get the array of nested BindingDecls when the binding represents a pack. + llvm::ArrayRef getBindingPackDecls() const; /// Get the decomposition declaration that this binding represents a /// decomposition of. @@ -4246,10 +4246,8 @@ class DecompositionDecl final for (auto *B : Bindings) { B->setDecomposedDecl(this); if (B->isParameterPack() && B->getBinding()) { - for (Expr *E : B->getBindingPackExprs()) { - auto *DRE = cast(E); - auto *NestedB = cast(DRE->getDecl()); - NestedB->setDecomposedDecl(this); + for (BindingDecl *NestedBD : B->getBindingPackDecls()) { + NestedBD->setDecomposedDecl(this); } } } @@ -4278,25 +4276,21 @@ class DecompositionDecl final // Provide a flattened range to visit each binding. auto flat_bindings() const { llvm::ArrayRef Bindings = bindings(); - llvm::ArrayRef PackExprs; + llvm::ArrayRef PackBindings; // Split the bindings into subranges split by the pack. - auto S1 = Bindings.take_until( + llvm::ArrayRef BeforePackBindings = Bindings.take_until( [](BindingDecl *BD) { return BD->isParameterPack(); }); - Bindings = Bindings.drop_front(S1.size()); + Bindings = Bindings.drop_front(BeforePackBindings.size()); if (!Bindings.empty()) { - PackExprs = Bindings.front()->getBindingPackExprs(); + PackBindings = Bindings.front()->getBindingPackDecls(); Bindings = Bindings.drop_front(); } - auto S2 = llvm::map_range(PackExprs, [](Expr *E) { - auto *DRE = cast(E); - return cast(DRE->getDecl()); - }); - - return llvm::concat(std::move(S1), std::move(S2), - std::move(Bindings)); + return llvm::concat(std::move(BeforePackBindings), + std::move(PackBindings), + std::move(Bindings)); } void printName(raw_ostream &OS, const PrintingPolicy &Policy) const override; diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 98ba2bb41bb54..abc65e77da021 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -4633,8 +4633,8 @@ class SubstNonTypeTemplateParmPackExpr : public Expr { } }; -/// Represents a reference to a function parameter pack or init-capture pack -/// that has been substituted but not yet expanded. +/// Represents a reference to a function parameter pack, init-capture pack, +/// or binding pack that has been substituted but not yet expanded. /// /// When a pack expansion contains multiple parameter packs at different levels, /// this node is used to represent a function parameter pack at an outer level @@ -4649,13 +4649,13 @@ class SubstNonTypeTemplateParmPackExpr : public Expr { /// \endcode class FunctionParmPackExpr final : public Expr, - private llvm::TrailingObjects { + private llvm::TrailingObjects { friend class ASTReader; friend class ASTStmtReader; friend TrailingObjects; /// The function parameter pack which was referenced. - VarDecl *ParamPack; + ValueDecl *ParamPack; /// The location of the function parameter pack reference. SourceLocation NameLoc; @@ -4663,35 +4663,34 @@ class FunctionParmPackExpr final /// The number of expansions of this pack. 
unsigned NumParameters; - FunctionParmPackExpr(QualType T, VarDecl *ParamPack, - SourceLocation NameLoc, unsigned NumParams, - VarDecl *const *Params); + FunctionParmPackExpr(QualType T, ValueDecl *ParamPack, SourceLocation NameLoc, + unsigned NumParams, ValueDecl *const *Params); public: static FunctionParmPackExpr *Create(const ASTContext &Context, QualType T, - VarDecl *ParamPack, + ValueDecl *ParamPack, SourceLocation NameLoc, - ArrayRef Params); + ArrayRef Params); static FunctionParmPackExpr *CreateEmpty(const ASTContext &Context, unsigned NumParams); /// Get the parameter pack which this expression refers to. - VarDecl *getParameterPack() const { return ParamPack; } + ValueDecl *getParameterPack() const { return ParamPack; } /// Get the location of the parameter pack. SourceLocation getParameterPackLocation() const { return NameLoc; } /// Iterators over the parameters which the parameter pack expanded /// into. - using iterator = VarDecl * const *; - iterator begin() const { return getTrailingObjects(); } + using iterator = ValueDecl *const *; + iterator begin() const { return getTrailingObjects(); } iterator end() const { return begin() + NumParameters; } /// Get the number of parameters in this parameter pack. unsigned getNumExpansions() const { return NumParameters; } /// Get an expansion of the parameter pack by index. - VarDecl *getExpansion(unsigned I) const { return begin()[I]; } + ValueDecl *getExpansion(unsigned I) const { return begin()[I]; } SourceLocation getBeginLoc() const LLVM_READONLY { return NameLoc; } SourceLocation getEndLoc() const LLVM_READONLY { return NameLoc; } @@ -5319,59 +5318,6 @@ class BuiltinBitCastExpr final } }; -// Represents an unexpanded pack where the list of expressions are -// known. These are used when structured bindings introduce a pack. 
-class ResolvedUnexpandedPackExpr final - : public Expr, - private llvm::TrailingObjects { - friend class ASTStmtReader; - friend class ASTStmtWriter; - friend TrailingObjects; - - SourceLocation BeginLoc; - unsigned NumExprs; - - ResolvedUnexpandedPackExpr(SourceLocation BL, QualType QT, unsigned NumExprs); - -public: - static ResolvedUnexpandedPackExpr *CreateDeserialized(ASTContext &C, - unsigned NumExprs); - static ResolvedUnexpandedPackExpr * - Create(ASTContext &C, SourceLocation BeginLoc, QualType T, unsigned NumExprs); - static ResolvedUnexpandedPackExpr *Create(ASTContext &C, - SourceLocation BeginLoc, QualType T, - llvm::ArrayRef Exprs); - - unsigned getNumExprs() const { return NumExprs; } - - llvm::MutableArrayRef getExprs() { - return {getTrailingObjects(), NumExprs}; - } - - llvm::ArrayRef getExprs() const { - return {getTrailingObjects(), NumExprs}; - } - - Expr *getExpansion(unsigned Idx) { return getExprs()[Idx]; } - Expr *getExpansion(unsigned Idx) const { return getExprs()[Idx]; } - - // Iterators - child_range children() { - return child_range((Stmt **)getTrailingObjects(), - (Stmt **)getTrailingObjects() + getNumExprs()); - } - - SourceLocation getBeginLoc() const LLVM_READONLY { return BeginLoc; } - SourceLocation getEndLoc() const LLVM_READONLY { return BeginLoc; } - - // Returns the resolved pack of a decl or nullptr - static ResolvedUnexpandedPackExpr *getFromDecl(Decl *); - - static bool classof(const Stmt *T) { - return T->getStmtClass() == ResolvedUnexpandedPackExprClass; - } -}; - } // namespace clang #endif // LLVM_CLANG_AST_EXPRCXX_H diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 560de7da9913a..5964cbaec8e44 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -2950,7 +2950,6 @@ DEF_TRAVERSE_STMT(FunctionParmPackExpr, {}) DEF_TRAVERSE_STMT(CXXFoldExpr, {}) DEF_TRAVERSE_STMT(AtomicExpr, {}) DEF_TRAVERSE_STMT(CXXParenListInitExpr, {}) -DEF_TRAVERSE_STMT(ResolvedUnexpandedPackExpr, {}) DEF_TRAVERSE_STMT(MaterializeTemporaryExpr, { if (S->getLifetimeExtendedTemporaryDecl()) { diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td index 3533c5f50742e..ae49671058a01 100644 --- a/clang/include/clang/Basic/StmtNodes.td +++ b/clang/include/clang/Basic/StmtNodes.td @@ -163,7 +163,6 @@ def MaterializeTemporaryExpr : StmtNode; def LambdaExpr : StmtNode; def CXXFoldExpr : StmtNode; def CXXParenListInitExpr: StmtNode; -def ResolvedUnexpandedPackExpr : StmtNode; // C++ Coroutines expressions def CoroutineSuspendExpr : StmtNode; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index a501b901862b6..c55b964650323 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -232,8 +232,7 @@ void threadSafetyCleanup(BeforeSet *Cache); // FIXME: No way to easily map from TemplateTypeParmTypes to // TemplateTypeParmDecls, so we have this horrible PointerUnion. -typedef std::pair, +typedef std::pair, SourceLocation> UnexpandedParameterPack; diff --git a/clang/include/clang/Sema/Template.h b/clang/include/clang/Sema/Template.h index 4206bd50b13dd..647c4cfa341e1 100644 --- a/clang/include/clang/Sema/Template.h +++ b/clang/include/clang/Sema/Template.h @@ -365,7 +365,7 @@ enum class TemplateSubstitutionKind : char { class LocalInstantiationScope { public: /// A set of declarations. 
- using DeclArgumentPack = SmallVector; + using DeclArgumentPack = SmallVector; private: /// Reference to the semantic analysis that is performing diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index ad93d50f6a82b..37cdb0fc9faa8 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -1908,7 +1908,6 @@ enum StmtCode { EXPR_PACK_EXPANSION, // PackExpansionExpr EXPR_PACK_INDEXING, // PackIndexingExpr EXPR_SIZEOF_PACK, // SizeOfPackExpr - EXPR_RESOLVED_UNEXPANDED_PACK, // ResolvedUnexpandedPackExpr EXPR_SUBST_NON_TYPE_TEMPLATE_PARM, // SubstNonTypeTemplateParmExpr EXPR_SUBST_NON_TYPE_TEMPLATE_PARM_PACK, // SubstNonTypeTemplateParmPackExpr EXPR_FUNCTION_PARM_PACK, // FunctionParmPackExpr diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 1aa48f0026335..7eff776882629 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -3504,10 +3504,13 @@ VarDecl *BindingDecl::getHoldingVar() const { return VD; } -llvm::ArrayRef BindingDecl::getBindingPackExprs() const { +llvm::ArrayRef BindingDecl::getBindingPackDecls() const { assert(Binding && "expecting a pack expr"); - auto *RP = cast(Binding); - return RP->getExprs(); + auto *FP = cast(Binding); + ValueDecl *const *First = FP->getNumExpansions() > 0 ? FP->begin() : nullptr; + assert((!First || isa(*First)) && "expecting a BindingDecl"); + return llvm::ArrayRef( + reinterpret_cast(First), FP->getNumExpansions()); } void DecompositionDecl::anchor() {} diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 460167c1b9a3d..6f570139630d8 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -3672,7 +3672,6 @@ bool Expr::HasSideEffects(const ASTContext &Ctx, case PackIndexingExprClass: case HLSLOutArgExprClass: case OpenACCAsteriskSizeExprClass: - case ResolvedUnexpandedPackExprClass: // These never have a side-effect. 
return false; diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp index d900af895b42a..c8d61e2cf3f26 100644 --- a/clang/lib/AST/ExprCXX.cpp +++ b/clang/lib/AST/ExprCXX.cpp @@ -1779,31 +1779,31 @@ TemplateArgument SubstNonTypeTemplateParmPackExpr::getArgumentPack() const { return TemplateArgument(llvm::ArrayRef(Arguments, NumArguments)); } -FunctionParmPackExpr::FunctionParmPackExpr(QualType T, VarDecl *ParamPack, +FunctionParmPackExpr::FunctionParmPackExpr(QualType T, ValueDecl *ParamPack, SourceLocation NameLoc, unsigned NumParams, - VarDecl *const *Params) + ValueDecl *const *Params) : Expr(FunctionParmPackExprClass, T, VK_LValue, OK_Ordinary), ParamPack(ParamPack), NameLoc(NameLoc), NumParameters(NumParams) { if (Params) std::uninitialized_copy(Params, Params + NumParams, - getTrailingObjects()); + getTrailingObjects()); setDependence(ExprDependence::TypeValueInstantiation | ExprDependence::UnexpandedPack); } FunctionParmPackExpr * FunctionParmPackExpr::Create(const ASTContext &Context, QualType T, - VarDecl *ParamPack, SourceLocation NameLoc, - ArrayRef Params) { - return new (Context.Allocate(totalSizeToAlloc(Params.size()))) + ValueDecl *ParamPack, SourceLocation NameLoc, + ArrayRef Params) { + return new (Context.Allocate(totalSizeToAlloc(Params.size()))) FunctionParmPackExpr(T, ParamPack, NameLoc, Params.size(), Params.data()); } FunctionParmPackExpr * FunctionParmPackExpr::CreateEmpty(const ASTContext &Context, unsigned NumParams) { - return new (Context.Allocate(totalSizeToAlloc(NumParams))) + return new (Context.Allocate(totalSizeToAlloc(NumParams))) FunctionParmPackExpr(QualType(), nullptr, SourceLocation(), 0, nullptr); } @@ -1965,52 +1965,3 @@ CXXFoldExpr::CXXFoldExpr(QualType T, UnresolvedLookupExpr *Callee, SubExprs[SubExpr::RHS] = RHS; setDependence(computeDependence(this)); } - -ResolvedUnexpandedPackExpr::ResolvedUnexpandedPackExpr(SourceLocation BL, - QualType QT, - unsigned NumExprs) - : Expr(ResolvedUnexpandedPackExprClass, QT, VK_PRValue, OK_Ordinary), - BeginLoc(BL), NumExprs(NumExprs) { - // C++ [temp.dep.expr]p3 - // An id-expression is type-dependent if it is - // - associated by name lookup with a pack - setDependence(ExprDependence::TypeValueInstantiation | - ExprDependence::UnexpandedPack); -} - -ResolvedUnexpandedPackExpr * -ResolvedUnexpandedPackExpr::CreateDeserialized(ASTContext &Ctx, - unsigned NumExprs) { - void *Mem = Ctx.Allocate(totalSizeToAlloc(NumExprs), - alignof(ResolvedUnexpandedPackExpr)); - return new (Mem) - ResolvedUnexpandedPackExpr(SourceLocation(), QualType(), NumExprs); -} - -ResolvedUnexpandedPackExpr * -ResolvedUnexpandedPackExpr::Create(ASTContext &Ctx, SourceLocation BL, - QualType T, unsigned NumExprs) { - void *Mem = Ctx.Allocate(totalSizeToAlloc(NumExprs), - alignof(ResolvedUnexpandedPackExpr)); - ResolvedUnexpandedPackExpr *New = - new (Mem) ResolvedUnexpandedPackExpr(BL, T, NumExprs); - - auto Exprs = New->getExprs(); - std::uninitialized_fill(Exprs.begin(), Exprs.end(), nullptr); - - return New; -} - -ResolvedUnexpandedPackExpr * -ResolvedUnexpandedPackExpr::Create(ASTContext &Ctx, SourceLocation BL, - QualType T, ArrayRef Exprs) { - auto *New = Create(Ctx, BL, T, Exprs.size()); - std::uninitialized_copy(Exprs.begin(), Exprs.end(), New->getExprs().begin()); - return New; -} - -ResolvedUnexpandedPackExpr *ResolvedUnexpandedPackExpr::getFromDecl(Decl *D) { - if (auto *BD = dyn_cast(D)) - return dyn_cast_if_present(BD->getBinding()); - return nullptr; -} diff --git a/clang/lib/AST/ExprClassification.cpp 
b/clang/lib/AST/ExprClassification.cpp index 5225c3ca773ad..3f37d06cc8f3a 100644 --- a/clang/lib/AST/ExprClassification.cpp +++ b/clang/lib/AST/ExprClassification.cpp @@ -451,13 +451,6 @@ static Cl::Kinds ClassifyInternal(ASTContext &Ctx, const Expr *E) { case Expr::PackExpansionExprClass: return ClassifyInternal(Ctx, cast(E)->getPattern()); - case Expr::ResolvedUnexpandedPackExprClass: { - if (cast(E)->getNumExprs() > 0) - return ClassifyInternal( - Ctx, cast(E)->getExpansion(0)); - return Cl::CL_LValue; - } - case Expr::MaterializeTemporaryExprClass: return cast(E)->isBoundToLvalueReference() ? Cl::CL_LValue diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 043974fb41443..6ccb6e23f8d2f 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -17253,7 +17253,6 @@ static ICEDiag CheckICE(const Expr* E, const ASTContext &Ctx) { case Expr::SYCLUniqueStableNameExprClass: case Expr::CXXParenListInitExprClass: case Expr::HLSLOutArgExprClass: - case Expr::ResolvedUnexpandedPackExprClass: return ICEDiag(IK_NotICE, E->getBeginLoc()); case Expr::InitListExprClass: { diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index e5eb22eae7dd1..4a090118c3d7b 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -4932,8 +4932,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity, case Expr::AtomicExprClass: case Expr::SourceLocExprClass: case Expr::EmbedExprClass: - case Expr::BuiltinBitCastExprClass: - case Expr::ResolvedUnexpandedPackExprClass: { + case Expr::BuiltinBitCastExprClass: { NotPrimaryExpr(); if (!NullOut) { // As bad as this diagnostic is, it's better than crashing. diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index 4b45190fa33ef..c8ea7b52a6241 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -2609,15 +2609,6 @@ void StmtPrinter::VisitPackIndexingExpr(PackIndexingExpr *E) { OS << "]"; } -void StmtPrinter::VisitResolvedUnexpandedPackExpr( - ResolvedUnexpandedPackExpr *E) { - OS << "<getExprs().begin(), E->getExprs().end(), - [this](auto *X) { PrintExpr(X); }, [this] { OS << ", "; }); - OS << ")>>"; -} - void StmtPrinter::VisitSubstNonTypeTemplateParmPackExpr( SubstNonTypeTemplateParmPackExpr *Node) { OS << *Node->getParameterPack(); diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 77ee6611f623f..2603df25ba2a4 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -2287,10 +2287,6 @@ void StmtProfiler::VisitSizeOfPackExpr(const SizeOfPackExpr *S) { ID.AddInteger(0); } } -void StmtProfiler::VisitResolvedUnexpandedPackExpr( - const ResolvedUnexpandedPackExpr *S) { - VisitExpr(S); -} void StmtProfiler::VisitPackIndexingExpr(const PackIndexingExpr *E) { VisitExpr(E); diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 0cf02fe6407c2..664d48ccbc382 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -980,24 +980,24 @@ static bool CheckBindingsCount(Sema &S, DecompositionDecl *DD, if (IsValid && HasPack) { // Create the pack expr and assign it to the binding. 
unsigned PackSize = MemberCount - Bindings.size() + 1; - QualType PackType = S.Context.getPackExpansionType( - S.Context.DependentTy, std::nullopt, /*ExpectsPackInType=*/false); - BindingDecl *BD = (*BindingWithPackItr); - auto *RP = ResolvedUnexpandedPackExpr::Create(S.Context, DD->getBeginLoc(), - DecompType, PackSize); - BD->setDecomposedDecl(DD); - BD->setBinding(PackType, RP); BindingDecl *BPack = *BindingWithPackItr; + BPack->setDecomposedDecl(DD); + SmallVector NestedBDs(PackSize); // Create the nested BindingDecls. - for (Expr *&E : RP->getExprs()) { - auto *NestedBD = BindingDecl::Create(S.Context, BPack->getDeclContext(), - BPack->getLocation(), - BPack->getIdentifier(), QualType()); + for (unsigned I = 0; I < PackSize; ++I) { + BindingDecl *NestedBD = BindingDecl::Create( + S.Context, BPack->getDeclContext(), BPack->getLocation(), + BPack->getIdentifier(), QualType()); NestedBD->setDecomposedDecl(DD); - E = S.BuildDeclRefExpr(NestedBD, S.Context.DependentTy, VK_LValue, - BPack->getLocation()); + NestedBDs[I] = NestedBD; } + + QualType PackType = S.Context.getPackExpansionType( + S.Context.DependentTy, PackSize, /*ExpectsPackInType=*/false); + auto *PackExpr = FunctionParmPackExpr::Create( + S.Context, PackType, BPack, BPack->getBeginLoc(), NestedBDs); + BPack->setBinding(PackType, PackExpr); } if (IsValid) diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp index 8c8ba1da88ebf..a8eb24133a76d 100644 --- a/clang/lib/Sema/SemaExceptionSpec.cpp +++ b/clang/lib/Sema/SemaExceptionSpec.cpp @@ -1286,7 +1286,6 @@ CanThrowResult Sema::canThrow(const Stmt *S) { case Expr::ConvertVectorExprClass: case Expr::VAArgExprClass: case Expr::CXXParenListInitExprClass: - case Expr::ResolvedUnexpandedPackExprClass: return canSubStmtsThrow(*this, S); case Expr::CompoundLiteralExprClass: diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 5817632b61dbd..1e660d7770dc6 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -19430,7 +19430,7 @@ static ExprResult rebuildPotentialResultsAsNonOdrUsed(Sema &S, Expr *E, auto *FPPE = cast(E); // If any of the declarations in the pack is odr-used, then the expression // as a whole constitutes an odr-use. - for (VarDecl *D : *FPPE) + for (ValueDecl *D : *FPPE) if (IsPotentialResultOdrUsed(D)) return ExprEmpty(); @@ -19705,7 +19705,7 @@ void Sema::CleanupVarDeclMarking() { MarkVarDeclODRUsed(cast(ME->getMemberDecl()), ME->getMemberLoc(), *this); } else if (auto *FP = dyn_cast(E)) { - for (VarDecl *VD : *FP) + for (ValueDecl *VD : *FP) MarkVarDeclODRUsed(VD, FP->getParameterPackLocation(), *this); } else { llvm_unreachable("Unexpected expression"); @@ -20081,7 +20081,7 @@ void Sema::MarkMemberReferenced(MemberExpr *E) { } void Sema::MarkFunctionParmPackReferenced(FunctionParmPackExpr *E) { - for (VarDecl *VD : *E) + for (ValueDecl *VD : *E) MarkExprReferenced(*this, E->getParameterPackLocation(), VD, E, true, RefsMinusAssignments); } diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index d1a45af6ca58f..121da4916ed43 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1585,20 +1585,16 @@ namespace { SubstNonTypeTemplateParmExpr *E); /// Rebuild a DeclRefExpr for a VarDecl reference. 
- ExprResult RebuildVarDeclRefExpr(VarDecl *PD, SourceLocation Loc); + ExprResult RebuildVarDeclRefExpr(ValueDecl *PD, SourceLocation Loc); /// Transform a reference to a function or init-capture parameter pack. - ExprResult TransformFunctionParmPackRefExpr(DeclRefExpr *E, VarDecl *PD); + ExprResult TransformFunctionParmPackRefExpr(DeclRefExpr *E, ValueDecl *PD); /// Transform a FunctionParmPackExpr which was built when we couldn't /// expand a function parameter pack reference which refers to an expanded /// pack. ExprResult TransformFunctionParmPackExpr(FunctionParmPackExpr *E); - // Transform a ResolvedUnexpandedPackExpr - ExprResult - TransformResolvedUnexpandedPackExpr(ResolvedUnexpandedPackExpr *E); - QualType TransformFunctionProtoType(TypeLocBuilder &TLB, FunctionProtoTypeLoc TL) { // Call the base version; it will forward to our overridden version below. @@ -2392,7 +2388,7 @@ TemplateInstantiator::TransformSubstNonTypeTemplateParmExpr( SugaredConverted, E->getPackIndex()); } -ExprResult TemplateInstantiator::RebuildVarDeclRefExpr(VarDecl *PD, +ExprResult TemplateInstantiator::RebuildVarDeclRefExpr(ValueDecl *PD, SourceLocation Loc) { DeclarationNameInfo NameInfo(PD->getDeclName(), Loc); return getSema().BuildDeclarationNameExpr(CXXScopeSpec(), NameInfo, PD); @@ -2402,8 +2398,8 @@ ExprResult TemplateInstantiator::TransformFunctionParmPackExpr(FunctionParmPackExpr *E) { if (getSema().ArgumentPackSubstitutionIndex != -1) { // We can expand this parameter pack now. - VarDecl *D = E->getExpansion(getSema().ArgumentPackSubstitutionIndex); - VarDecl *VD = cast_or_null(TransformDecl(E->getExprLoc(), D)); + ValueDecl *D = E->getExpansion(getSema().ArgumentPackSubstitutionIndex); + ValueDecl *VD = cast_or_null(TransformDecl(E->getExprLoc(), D)); if (!VD) return ExprError(); return RebuildVarDeclRefExpr(VD, E->getExprLoc()); @@ -2415,11 +2411,11 @@ TemplateInstantiator::TransformFunctionParmPackExpr(FunctionParmPackExpr *E) { // Transform each of the parameter expansions into the corresponding // parameters in the instantiation of the function decl. - SmallVector Vars; + SmallVector Vars; Vars.reserve(E->getNumExpansions()); for (FunctionParmPackExpr::iterator I = E->begin(), End = E->end(); I != End; ++I) { - VarDecl *D = cast_or_null(TransformDecl(E->getExprLoc(), *I)); + ValueDecl *D = cast_or_null(TransformDecl(E->getExprLoc(), *I)); if (!D) return ExprError(); Vars.push_back(D); @@ -2434,7 +2430,7 @@ TemplateInstantiator::TransformFunctionParmPackExpr(FunctionParmPackExpr *E) { ExprResult TemplateInstantiator::TransformFunctionParmPackRefExpr(DeclRefExpr *E, - VarDecl *PD) { + ValueDecl *PD) { typedef LocalInstantiationScope::DeclArgumentPack DeclArgumentPack; llvm::PointerUnion *Found = getSema().CurrentInstantiationScope->findInstantiationOf(PD); @@ -2460,7 +2456,8 @@ TemplateInstantiator::TransformFunctionParmPackRefExpr(DeclRefExpr *E, } // We have either an unexpanded pack or a specific expansion. 
- return RebuildVarDeclRefExpr(cast(TransformedDecl), E->getExprLoc()); + return RebuildVarDeclRefExpr(cast(TransformedDecl), + E->getExprLoc()); } ExprResult @@ -2482,15 +2479,6 @@ TemplateInstantiator::TransformDeclRefExpr(DeclRefExpr *E) { if (PD->isParameterPack()) return TransformFunctionParmPackRefExpr(E, PD); - if (BindingDecl *BD = dyn_cast(D); BD && BD->isParameterPack()) { - BD = cast_or_null(TransformDecl(BD->getLocation(), BD)); - if (!BD) - return ExprError(); - if (auto *RP = - dyn_cast_if_present(BD->getBinding())) - return TransformResolvedUnexpandedPackExpr(RP); - } - return inherited::TransformDeclRefExpr(E); } @@ -2651,19 +2639,6 @@ TemplateInstantiator::TransformTemplateTypeParmType(TypeLocBuilder &TLB, return Result; } -ExprResult TemplateInstantiator::TransformResolvedUnexpandedPackExpr( - ResolvedUnexpandedPackExpr *E) { - if (getSema().ArgumentPackSubstitutionIndex != -1) { - assert(static_cast(getSema().ArgumentPackSubstitutionIndex) < - E->getNumExprs() && - "ArgumentPackSubstitutionIndex is out of range"); - return TransformExpr( - E->getExpansion(getSema().ArgumentPackSubstitutionIndex)); - } - - return inherited::TransformResolvedUnexpandedPackExpr(E); -} - QualType TemplateInstantiator::TransformSubstTemplateTypeParmPackType( TypeLocBuilder &TLB, SubstTemplateTypeParmPackTypeLoc TL, bool SuppressObjCLifetime) { @@ -4680,7 +4655,7 @@ void LocalInstantiationScope::InstantiatedLocal(const Decl *D, Decl *Inst) { #endif Stored = Inst; } else if (DeclArgumentPack *Pack = dyn_cast(Stored)) { - Pack->push_back(cast(Inst)); + Pack->push_back(cast(Inst)); } else { assert(cast(Stored) == Inst && "Already instantiated this local"); } diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 1f42f9500959e..1cdf80898bfca 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -1179,13 +1179,13 @@ Decl *TemplateDeclInstantiator::VisitDecompositionDecl(DecompositionDecl *D) { // Transform the bindings first. // The transformed DD will have all of the concrete BindingDecls. SmallVector NewBindings; - ResolvedUnexpandedPackExpr *OldResolvedPack = nullptr; + BindingDecl *OldBindingPack = nullptr; for (auto *OldBD : D->bindings()) { Expr *BindingExpr = OldBD->getBinding(); - if (auto *RP = - dyn_cast_if_present(BindingExpr)) { - assert(!OldResolvedPack && "no more than one pack is allowed"); - OldResolvedPack = RP; + if (isa_and_present(BindingExpr)) { + // We have a resolved pack. + assert(!OldBindingPack && "no more than one pack is allowed"); + OldBindingPack = OldBD; } NewBindings.push_back(cast(VisitBindingDecl(OldBD))); } @@ -1198,25 +1198,20 @@ Decl *TemplateDeclInstantiator::VisitDecompositionDecl(DecompositionDecl *D) { for (auto *NewBD : NewBindings) NewBD->setInvalidDecl(); - if (OldResolvedPack) { - // Mark the holding vars (if any) in the pack as instantiated since - // they are created implicitly. + if (OldBindingPack) { + // Mark the bindings in the pack as instantiated. 
auto Bindings = NewDD->bindings(); - auto BPack = llvm::find_if( + BindingDecl *NewBindingPack = *llvm::find_if( Bindings, [](BindingDecl *D) -> bool { return D->isParameterPack(); }); - auto *NewResolvedPack = - cast((*BPack)->getBinding()); - auto OldExprs = OldResolvedPack->getExprs(); - auto NewExprs = NewResolvedPack->getExprs(); - assert(OldExprs.size() == NewExprs.size()); - for (unsigned I = 0; I < OldResolvedPack->getNumExprs(); I++) { - DeclRefExpr *OldDRE = cast(OldExprs[I]); - BindingDecl *OldNestedBD = cast(OldDRE->getDecl()); - DeclRefExpr *NewDRE = cast(NewExprs[I]); - BindingDecl *NewNestedBD = cast(NewDRE->getDecl()); - SemaRef.CurrentInstantiationScope->InstantiatedLocal(OldNestedBD, - NewNestedBD); - } + assert(NewBindingPack != nullptr && "new bindings should also have a pack"); + llvm::ArrayRef OldDecls = + OldBindingPack->getBindingPackDecls(); + llvm::ArrayRef NewDecls = + NewBindingPack->getBindingPackDecls(); + assert(OldDecls.size() == NewDecls.size()); + for (unsigned I = 0; I < OldDecls.size(); I++) + SemaRef.CurrentInstantiationScope->InstantiatedLocal(OldDecls[I], + NewDecls[I]); } return NewDD; @@ -6280,9 +6275,7 @@ NamedDecl *Sema::FindInstantiatedDecl(SourceLocation Loc, NamedDecl *D, if (auto *BD = dyn_cast(FD); BD && BD->isParameterPack() && ArgumentPackSubstitutionIndex != -1) { - auto *DRE = cast( - BD->getBindingPackExprs()[ArgumentPackSubstitutionIndex]); - return cast(DRE->getDecl()); + return BD->getBindingPackDecls()[ArgumentPackSubstitutionIndex]; } return cast(FD); } diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp index 3c56794722dcc..fad00f7648848 100644 --- a/clang/lib/Sema/SemaTemplateVariadic.cpp +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp @@ -50,13 +50,8 @@ class CollectUnexpandedParameterPacksVisitor auto *FTD = FD ? FD->getDescribedFunctionTemplate() : nullptr; if (FTD && FTD->getTemplateParameters()->getDepth() >= DepthLimit) return; - } else if (auto *BD = dyn_cast(ND)) { - Expr *E = BD->getBinding(); - if (auto *RP = cast_if_present(E)) { - addUnexpanded(RP); - return; - } - } else if (getDepthAndIndex(ND).first >= DepthLimit) { + } else if (ND->isTemplateParameterPack() && + getDepthAndIndex(ND).first >= DepthLimit) { return; } @@ -69,10 +64,6 @@ class CollectUnexpandedParameterPacksVisitor Unexpanded.push_back({T, Loc}); } - void addUnexpanded(ResolvedUnexpandedPackExpr *E) { - Unexpanded.push_back({E, E->getBeginLoc()}); - } - public: explicit CollectUnexpandedParameterPacksVisitor( SmallVectorImpl &Unexpanded) @@ -115,12 +106,6 @@ class CollectUnexpandedParameterPacksVisitor return true; } - bool - VisitResolvedUnexpandedPackExpr(ResolvedUnexpandedPackExpr *E) override { - addUnexpanded(E); - return true; - } - /// Record occurrences of template template parameter packs. 
bool TraverseTemplateName(TemplateName Template) override { if (auto *TTP = dyn_cast_or_null( @@ -782,16 +767,13 @@ bool Sema::CheckParameterPacksForExpansion( unsigned Depth = 0, Index = 0; IdentifierInfo *Name; bool IsVarDeclPack = false; - ResolvedUnexpandedPackExpr *ResolvedPack = nullptr; + FunctionParmPackExpr *BindingPack = nullptr; if (const TemplateTypeParmType *TTP = ParmPack.first.dyn_cast()) { Depth = TTP->getDepth(); Index = TTP->getIndex(); Name = TTP->getIdentifier(); - } else if (auto *RP = - ParmPack.first.dyn_cast()) { - ResolvedPack = RP; } else { NamedDecl *ND = cast(ParmPack.first); if (isa(ND)) @@ -802,8 +784,8 @@ bool Sema::CheckParameterPacksForExpansion( CurrentInstantiationScope->findInstantiationOf(ND); Decl *B = cast(*Instantiation); Expr *BindingExpr = cast(B)->getBinding(); - ResolvedPack = cast_if_present(BindingExpr); - if (!ResolvedPack) { + BindingPack = cast_if_present(BindingExpr); + if (!BindingPack) { ShouldExpand = false; continue; } @@ -829,8 +811,8 @@ bool Sema::CheckParameterPacksForExpansion( ShouldExpand = false; continue; } - } else if (ResolvedPack) { - NewPackSize = ResolvedPack->getNumExprs(); + } else if (BindingPack) { + NewPackSize = BindingPack->getNumExpansions(); } else { // If we don't have a template argument at this depth/index, then we // cannot expand the pack expansion. Make a note of this, but we still @@ -867,7 +849,7 @@ bool Sema::CheckParameterPacksForExpansion( // Template argument deduction can extend the sequence of template // arguments corresponding to a template parameter pack, even when the // sequence contains explicitly specified template arguments. - if (!IsVarDeclPack && !ResolvedPack && CurrentInstantiationScope) { + if (!IsVarDeclPack && CurrentInstantiationScope) { if (NamedDecl *PartialPack = CurrentInstantiationScope->getPartiallySubstitutedPack()) { unsigned PartialDepth, PartialIndex; @@ -973,12 +955,6 @@ std::optional Sema::getNumArgumentsInExpansionFromUnexpanded( Unexpanded[I].first.dyn_cast()) { Depth = TTP->getDepth(); Index = TTP->getIndex(); - } else if (auto *PE = Unexpanded[I] - .first.dyn_cast()) { - unsigned Size = PE->getNumExprs(); - assert((!Result || *Result == Size) && "inconsistent pack sizes"); - Result = Size; - continue; } else { NamedDecl *ND = cast(Unexpanded[I].first); if (isa(ND)) { @@ -1207,12 +1183,8 @@ ExprResult Sema::ActOnSizeofParameterPackExpr(Scope *S, MarkAnyDeclReferenced(OpLoc, ParameterPack, true); - std::optional Length; - if (auto *RP = ResolvedUnexpandedPackExpr::getFromDecl(ParameterPack)) - Length = RP->getNumExprs(); - return SizeOfPackExpr::Create(Context, OpLoc, ParameterPack, NameLoc, - RParenLoc, Length); + RParenLoc); } static bool isParameterPack(Expr *PackExpression) { @@ -1360,7 +1332,7 @@ std::optional Sema::getFullyPackExpandedSize(TemplateArgument Arg) { dyn_cast(Arg.getAsExpr())) Pack = Subst->getArgumentPack(); else if (auto *Subst = dyn_cast(Arg.getAsExpr())) { - for (VarDecl *PD : *Subst) + for (ValueDecl *PD : *Subst) if (PD->isParameterPack()) return std::nullopt; return Subst->getNumExpansions(); diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 73e979927b4f3..05cac8db3c42c 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -3680,13 +3680,6 @@ class TreeTransform { FullySubstituted); } - ExprResult RebuildResolvedUnexpandedPackExpr(SourceLocation BeginLoc, - QualType T, - ArrayRef Exprs) { - return ResolvedUnexpandedPackExpr::Create(SemaRef.Context, BeginLoc, T, - Exprs); - } - /// 
Build a new expression representing a call to a source location /// builtin. /// @@ -16183,24 +16176,6 @@ TreeTransform::TransformFunctionParmPackExpr(FunctionParmPackExpr *E) { return E; } -template -ExprResult TreeTransform::TransformResolvedUnexpandedPackExpr( - ResolvedUnexpandedPackExpr *E) { - bool ArgumentChanged = false; - SmallVector NewExprs; - if (TransformExprs(E->getExprs().begin(), E->getNumExprs(), - /*IsCall=*/false, NewExprs, &ArgumentChanged)) - return ExprError(); - - if (!AlwaysRebuild() && !ArgumentChanged) - return E; - - // NOTE: The type is just a superficial PackExpansionType - // that needs no substitution. - return RebuildResolvedUnexpandedPackExpr(E->getBeginLoc(), E->getType(), - NewExprs); -} - template ExprResult TreeTransform::TransformMaterializeTemporaryExpr( diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index a89eee601e437..fba54023a6bb2 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2208,16 +2208,6 @@ void ASTStmtReader::VisitPackIndexingExpr(PackIndexingExpr *E) { Exprs[I] = Record.readExpr(); } -void ASTStmtReader::VisitResolvedUnexpandedPackExpr( - ResolvedUnexpandedPackExpr *E) { - VisitExpr(E); - E->NumExprs = Record.readInt(); - E->BeginLoc = readSourceLocation(); - auto **Exprs = E->getTrailingObjects(); - for (unsigned I = 0; I < E->NumExprs; ++I) - Exprs[I] = Record.readExpr(); -} - void ASTStmtReader::VisitSubstNonTypeTemplateParmExpr( SubstNonTypeTemplateParmExpr *E) { VisitExpr(E); @@ -2249,11 +2239,11 @@ void ASTStmtReader::VisitSubstNonTypeTemplateParmPackExpr( void ASTStmtReader::VisitFunctionParmPackExpr(FunctionParmPackExpr *E) { VisitExpr(E); E->NumParameters = Record.readInt(); - E->ParamPack = readDeclAs(); + E->ParamPack = readDeclAs(); E->NameLoc = readSourceLocation(); - auto **Parms = E->getTrailingObjects(); + auto **Parms = E->getTrailingObjects(); for (unsigned i = 0, n = E->NumParameters; i != n; ++i) - Parms[i] = readDeclAs(); + Parms[i] = readDeclAs(); } void ASTStmtReader::VisitMaterializeTemporaryExpr(MaterializeTemporaryExpr *E) { @@ -4321,12 +4311,6 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { /*TransformedExprs=*/Record[ASTStmtReader::NumExprFields]); break; - case EXPR_RESOLVED_UNEXPANDED_PACK: - S = ResolvedUnexpandedPackExpr::CreateDeserialized( - Context, - /*NumExprs=*/Record[ASTStmtReader::NumExprFields]); - break; - case EXPR_SUBST_NON_TYPE_TEMPLATE_PARM: S = new (Context) SubstNonTypeTemplateParmExpr(Empty); break; diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 64791300fe722..79b777cddd0b0 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -874,7 +874,6 @@ static void AddStmtsExprs(llvm::BitstreamWriter &Stream, RECORD(EXPR_PACK_EXPANSION); RECORD(EXPR_SIZEOF_PACK); RECORD(EXPR_PACK_INDEXING); - RECORD(EXPR_RESOLVED_UNEXPANDED_PACK); RECORD(EXPR_SUBST_NON_TYPE_TEMPLATE_PARM); RECORD(EXPR_SUBST_NON_TYPE_TEMPLATE_PARM_PACK); RECORD(EXPR_FUNCTION_PARM_PACK); diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 6a779f1618287..2687231d7820f 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2210,16 +2210,6 @@ void ASTStmtWriter::VisitPackIndexingExpr(PackIndexingExpr *E) { Code = serialization::EXPR_PACK_INDEXING; } -void ASTStmtWriter::VisitResolvedUnexpandedPackExpr( - 
ResolvedUnexpandedPackExpr *E) { - VisitExpr(E); - Record.push_back(E->getNumExprs()); - Record.AddSourceLocation(E->getBeginLoc()); - for (Expr *Sub : E->getExprs()) - Record.AddStmt(Sub); - Code = serialization::EXPR_RESOLVED_UNEXPANDED_PACK; -} - void ASTStmtWriter::VisitSubstNonTypeTemplateParmExpr( SubstNonTypeTemplateParmExpr *E) { VisitExpr(E); diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index d93952264a606..c3dcdc985a935 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -1745,7 +1745,6 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, case Stmt::DependentCoawaitExprClass: case Stmt::CoreturnStmtClass: case Stmt::CoyieldExprClass: - case Stmt::ResolvedUnexpandedPackExprClass: case Stmt::SEHTryStmtClass: case Stmt::SEHExceptStmtClass: case Stmt::SEHLeaveStmtClass: diff --git a/clang/test/AST/ast-dump-binding-pack.cpp b/clang/test/AST/ast-dump-binding-pack.cpp index 81c75a1268730..c4a353ae72a1b 100644 --- a/clang/test/AST/ast-dump-binding-pack.cpp +++ b/clang/test/AST/ast-dump-binding-pack.cpp @@ -22,10 +22,7 @@ void foo() { // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 0 // CHECK-NOT: BindingDecl // CHECK-LABEL: BindingDecl {{.*}} binding_rest -// CHECK-NEXT: ResolvedUnexpandedPackExpr -// CHECK-NEXT: DeclRefExpr {{.*}} lvalue Binding {{.*}} 'binding_rest' -// CHECK-NEXT: DeclRefExpr {{.*}} lvalue Binding {{.*}} 'binding_rest' -// CHECK-NOT: BindingDecl +// CHECK-NEXT: FunctionParmPackExpr // CHECK-LABEL: BindingDecl {{.*}} binding_4 // CHECK-NEXT: ArraySubscriptExpr // CHECK-NEXT: ImplicitCastExpr {{.*}} @@ -47,9 +44,7 @@ void bar() { // CHECK-LABEL: FunctionTemplateDecl {{.*}} bar // CHECK-NOT: BindingDecl // CHECK-LABEL: BindingDecl {{.*}} empty_binding_pack -// CHECK-NEXT: ResolvedUnexpandedPackExpr -// CHECK-NOT: DeclRefExpr {{.*}} 'empty_binding_pack' -// CHECK-NOT: BindingDecl +// CHECK-NEXT: FunctionParmPackExpr // CHECK: DeclStmt struct int_pair { int x; int y; }; @@ -67,8 +62,6 @@ void(*f)() = baz; // CHECK: BindingDecl {{.*}} binding_2 // CHECK-NOT: BindingDecl // CHECK-LABEL: BindingDecl {{.*}} empty_binding_pack -// CHECK-NEXT: ResolvedUnexpandedPackExpr -// CHECK-NOT: DeclRefExpr {{.*}} 'empty_binding_pack' -// CHECK-NOT: BindingDecl +// CHECK-NEXT: FunctionParmPackExpr // CHECK: DeclStmt #endif diff --git a/clang/test/SemaCXX/cxx2c-binding-pack.cpp b/clang/test/SemaCXX/cxx2c-binding-pack.cpp index 5ca249f52b3d8..62e1da565f2b5 100644 --- a/clang/test/SemaCXX/cxx2c-binding-pack.cpp +++ b/clang/test/SemaCXX/cxx2c-binding-pack.cpp @@ -59,6 +59,7 @@ template void decompose_struct() { T obj{1, 2, 3, 6}; auto [x, ...rest, y] = obj; + static_assert(sizeof...(rest) == 2); auto [...empty] = type_{}; static_assert(sizeof...(empty) == 0); @@ -124,6 +125,14 @@ void lambda_capture() { [&x...] { (void)sum(x...); }(); } +struct S2 { + int a, b, c; +}; + +auto X = [] () { + auto [...pack] = S2{}; +}; + int main() { decompose_array(); decompose_tuple(); @@ -133,6 +142,8 @@ int main() { lambda_capture(); lambda_capture(); lambda_capture(); + X(); + } // P1061R10 Stuff @@ -188,3 +199,22 @@ void other_main() { static_assert(f() == 2); } } // namespace + +namespace { +struct S { + int a,b,c; +}; + +clsss S2 { // expected-error{{{unknown type name 'clsss'}}} +public: + int a,b,c; +}; + +// Should not crash. 
+auto X = [] () {
+  auto [...pack,a,b,c] = S{};
+  auto [x,y,z,...pack2] = S{};
+  auto [...pack3] = S2{};
+  static_assert(sizeof...(pack3) == 5);
+};
+} // namespace
diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp
index 9ca0ce36bb7f2..0810c38bb751b 100644
--- a/clang/tools/libclang/CXCursor.cpp
+++ b/clang/tools/libclang/CXCursor.cpp
@@ -338,7 +338,6 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent,
   case Stmt::EmbedExprClass:
   case Stmt::HLSLOutArgExprClass:
   case Stmt::OpenACCAsteriskSizeExprClass:
-  case Stmt::ResolvedUnexpandedPackExprClass:
     K = CXCursor_UnexposedExpr;
     break;

From 252c83bc9ef9fc885e9e6517f1b4423188bb919f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20=C3=81lvarez=20Ayll=C3=B3n?=
Date: Tue, 18 Feb 2025 09:52:31 +0100
Subject: [PATCH 093/127] [clang][Sema] Fix type of a statement expression ending with an atomic type (#119711)

When a statement expression's last statement is an atomic variable, GCC and Clang disagree on the type of the expression. This can be made apparent using `typeof` and forcing a diagnostic message:

```cpp
_Atomic int a = 0;
typeof(({a;})) x = "0";
```

* GCC complains about initializing `int` with `char*`
* Clang complains about initializing `_Atomic(int)` with a `char[2]`

Due to the type of the statement expression being deduced to be atomic, we end up with three implicit casts inside the `StmtExpr` on the AST:

* `LValueToRValue` -> `AtomicToNonAtomic` -> `NonAtomicToAtomic`

In some situations, this can trigger an assertion failure inside `IntExprEvaluator`, as reported in #106576.

With this patch, we now have two implicit casts, since the type of the statement expression is deduced to be non-atomic:

* `LValueToRValue` -> `AtomicToNonAtomic`

This is consistent with the C standard (6.7.2.4, p4):

> The properties associated with atomic types are meaningful only for expressions that are lvalues.

But a statement expression is an rvalue.

`IntExprEvaluator` assumptions are now satisfied and there is no assertion failure. Additionally, the `typeof` trick mentioned above shows that the type is consistently deduced between GCC and Clang.

Fixes #106576

---------

Co-authored-by: John McCall
---
 clang/docs/ReleaseNotes.rst |  1 +
 clang/lib/Sema/SemaExpr.cpp |  2 +-
 clang/test/Sema/gh106576.c  | 14 ++++++++++++++
 3 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/Sema/gh106576.c

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 6272f32fa845a..a91c764860ccd 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -173,6 +173,7 @@ Bug Fixes to C++ Support

 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
+- Fixed type checking when a statement expression ends in an l-value of atomic type. (#GH106576)

 Miscellaneous Bug Fixes
 ^^^^^^^^^^^^^^^^^^^^^^^

diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 1e660d7770dc6..fad15bf95c415 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -15949,7 +15949,7 @@ ExprResult Sema::ActOnStmtExprResult(ExprResult ER) {
   // FIXME: Provide a better location for the initialization.
return PerformCopyInitialization( InitializedEntity::InitializeStmtExprResult( - E->getBeginLoc(), E->getType().getUnqualifiedType()), + E->getBeginLoc(), E->getType().getAtomicUnqualifiedType()), SourceLocation(), E); } diff --git a/clang/test/Sema/gh106576.c b/clang/test/Sema/gh106576.c new file mode 100644 index 0000000000000..a72592aac0129 --- /dev/null +++ b/clang/test/Sema/gh106576.c @@ -0,0 +1,14 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +typedef _Atomic char atomic_char; + +atomic_char counter; + +char load_plus_one(void) { + return ({counter;}) + 1; // no crash +} + +char type_of_stmt_expr(void) { + typeof(({counter;})) y = ""; // expected-error-re {{incompatible pointer to integer conversion initializing 'typeof (({{{.*}}}))' (aka 'char') with an expression of type 'char[1]'}} + return y; +} From 88dd372d673c7e6967c93aa2879f0ef04fc7ac20 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Tue, 18 Feb 2025 09:02:29 +0000 Subject: [PATCH 094/127] [flang][Lower][OpenMP] Don't read moldarg for static sized array (#125901) This should further reduce the number of spurious barriers --- .../lib/Lower/OpenMP/DataSharingProcessor.cpp | 5 +- .../Lower/OpenMP/PrivateReductionUtils.cpp | 61 +++++++++++++------ .../lib/Lower/OpenMP/PrivateReductionUtils.h | 6 +- .../OpenMP/delayed-privatization-array.f90 | 7 +-- 4 files changed, 51 insertions(+), 28 deletions(-) diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index d13f101f516e7..d725dfd3e94f3 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -508,6 +508,8 @@ void DataSharingProcessor::doPrivatize(const semantics::Symbol *sym, lower::SymbolBox hsb = converter.lookupOneLevelUpSymbol(*sym); assert(hsb && "Host symbol box not found"); + hlfir::Entity entity{hsb.getAddr()}; + bool cannotHaveNonDefaultLowerBounds = !entity.mayHaveNonDefaultLowerBounds(); mlir::Location symLoc = hsb.getAddr().getLoc(); std::string privatizerName = sym->name().ToString() + ".privatizer"; @@ -528,7 +530,6 @@ void DataSharingProcessor::doPrivatize(const semantics::Symbol *sym, // an alloca for a fir.array type there. Get around this by boxing all // arrays. if (mlir::isa(allocType)) { - hlfir::Entity entity{hsb.getAddr()}; entity = genVariableBox(symLoc, firOpBuilder, entity); privVal = entity.getBase(); allocType = privVal.getType(); @@ -590,7 +591,7 @@ void DataSharingProcessor::doPrivatize(const semantics::Symbol *sym, result.getDeallocRegion(), isFirstPrivate ? 
DeclOperationKind::FirstPrivate : DeclOperationKind::Private, - sym); + sym, cannotHaveNonDefaultLowerBounds); // TODO: currently there are false positives from dead uses of the mold // arg if (!result.getInitMoldArg().getUses().empty()) diff --git a/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp b/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp index 22cd0679050db..21ade77d82d37 100644 --- a/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp +++ b/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp @@ -122,25 +122,40 @@ static void createCleanupRegion(Fortran::lower::AbstractConverter &converter, typeError(); } -fir::ShapeShiftOp Fortran::lower::omp::getShapeShift(fir::FirOpBuilder &builder, - mlir::Location loc, - mlir::Value box) { +fir::ShapeShiftOp +Fortran::lower::omp::getShapeShift(fir::FirOpBuilder &builder, + mlir::Location loc, mlir::Value box, + bool cannotHaveNonDefaultLowerBounds) { fir::SequenceType sequenceType = mlir::cast( hlfir::getFortranElementOrSequenceType(box.getType())); const unsigned rank = sequenceType.getDimension(); + llvm::SmallVector lbAndExtents; lbAndExtents.reserve(rank * 2); - mlir::Type idxTy = builder.getIndexType(); - for (unsigned i = 0; i < rank; ++i) { - // TODO: ideally we want to hoist box reads out of the critical section. - // We could do this by having box dimensions in block arguments like - // OpenACC does - mlir::Value dim = builder.createIntegerConstant(loc, idxTy, i); - auto dimInfo = - builder.create(loc, idxTy, idxTy, idxTy, box, dim); - lbAndExtents.push_back(dimInfo.getLowerBound()); - lbAndExtents.push_back(dimInfo.getExtent()); + + if (cannotHaveNonDefaultLowerBounds && !sequenceType.hasDynamicExtents()) { + // We don't need fir::BoxDimsOp if all of the extents are statically known + // and we can assume default lower bounds. This helps avoids reads from the + // mold arg. + mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); + for (int64_t extent : sequenceType.getShape()) { + assert(extent != sequenceType.getUnknownExtent()); + mlir::Value extentVal = builder.createIntegerConstant(loc, idxTy, extent); + lbAndExtents.push_back(one); + lbAndExtents.push_back(extentVal); + } + } else { + for (unsigned i = 0; i < rank; ++i) { + // TODO: ideally we want to hoist box reads out of the critical section. 
+ // We could do this by having box dimensions in block arguments like + // OpenACC does + mlir::Value dim = builder.createIntegerConstant(loc, idxTy, i); + auto dimInfo = + builder.create(loc, idxTy, idxTy, idxTy, box, dim); + lbAndExtents.push_back(dimInfo.getLowerBound()); + lbAndExtents.push_back(dimInfo.getExtent()); + } } auto shapeShiftTy = fir::ShapeShiftType::get(builder.getContext(), rank); @@ -248,12 +263,13 @@ class PopulateInitAndCleanupRegionsHelper { mlir::Type argType, mlir::Value scalarInitValue, mlir::Value allocatedPrivVarArg, mlir::Value moldArg, mlir::Block *initBlock, mlir::Region &cleanupRegion, - DeclOperationKind kind, const Fortran::semantics::Symbol *sym) + DeclOperationKind kind, const Fortran::semantics::Symbol *sym, + bool cannotHaveLowerBounds) : converter{converter}, builder{converter.getFirOpBuilder()}, loc{loc}, argType{argType}, scalarInitValue{scalarInitValue}, allocatedPrivVarArg{allocatedPrivVarArg}, moldArg{moldArg}, initBlock{initBlock}, cleanupRegion{cleanupRegion}, kind{kind}, - sym{sym} { + sym{sym}, cannotHaveNonDefaultLowerBounds{cannotHaveLowerBounds} { valType = fir::unwrapRefType(argType); } @@ -295,6 +311,10 @@ class PopulateInitAndCleanupRegionsHelper { /// Any length parameters which have been fetched for the type mlir::SmallVector lenParams; + /// If the source variable being privatized definitely can't have non-default + /// lower bounds then we don't need to generate code to read them. + bool cannotHaveNonDefaultLowerBounds; + void createYield(mlir::Value ret) { builder.create(loc, ret); } @@ -432,7 +452,8 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray( // Special case for (possibly allocatable) arrays of polymorphic types // e.g. !fir.class>>> if (source.isPolymorphic()) { - fir::ShapeShiftOp shape = getShapeShift(builder, loc, source); + fir::ShapeShiftOp shape = + getShapeShift(builder, loc, source, cannotHaveNonDefaultLowerBounds); mlir::Type arrayType = source.getElementOrSequenceType(); mlir::Value allocatedArray = builder.create( loc, arrayType, /*typeparams=*/mlir::ValueRange{}, shape.getExtents()); @@ -471,8 +492,8 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray( // Put the temporary inside of a box: // hlfir::genVariableBox doesn't handle non-default lower bounds mlir::Value box; - fir::ShapeShiftOp shapeShift = - getShapeShift(builder, loc, getLoadedMoldArg()); + fir::ShapeShiftOp shapeShift = getShapeShift(builder, loc, getLoadedMoldArg(), + cannotHaveNonDefaultLowerBounds); mlir::Type boxType = getLoadedMoldArg().getType(); if (mlir::isa(temp.getType())) // the box created by the declare form createTempFromMold is missing @@ -607,10 +628,10 @@ void Fortran::lower::omp::populateByRefInitAndCleanupRegions( mlir::Type argType, mlir::Value scalarInitValue, mlir::Block *initBlock, mlir::Value allocatedPrivVarArg, mlir::Value moldArg, mlir::Region &cleanupRegion, DeclOperationKind kind, - const Fortran::semantics::Symbol *sym) { + const Fortran::semantics::Symbol *sym, bool cannotHaveLowerBounds) { PopulateInitAndCleanupRegionsHelper helper( converter, loc, argType, scalarInitValue, allocatedPrivVarArg, moldArg, - initBlock, cleanupRegion, kind, sym); + initBlock, cleanupRegion, kind, sym, cannotHaveLowerBounds); helper.populateByRefInitAndCleanupRegions(); // Often we load moldArg to check something (e.g. 
length parameters, shape)
diff --git a/flang/lib/Lower/OpenMP/PrivateReductionUtils.h b/flang/lib/Lower/OpenMP/PrivateReductionUtils.h
index fcd36392a29e0..0a3513bff19b0 100644
--- a/flang/lib/Lower/OpenMP/PrivateReductionUtils.h
+++ b/flang/lib/Lower/OpenMP/PrivateReductionUtils.h
@@ -55,11 +55,13 @@ void populateByRefInitAndCleanupRegions(
     mlir::Value scalarInitValue, mlir::Block *initBlock,
     mlir::Value allocatedPrivVarArg, mlir::Value moldArg,
     mlir::Region &cleanupRegion, DeclOperationKind kind,
-    const Fortran::semantics::Symbol *sym = nullptr);
+    const Fortran::semantics::Symbol *sym = nullptr,
+    bool cannotHaveNonDefaultLowerBounds = false);

 /// Generate a fir::ShapeShift op describing the provided boxed array.
 fir::ShapeShiftOp getShapeShift(fir::FirOpBuilder &builder, mlir::Location loc,
-                                mlir::Value box);
+                                mlir::Value box,
+                                bool cannotHaveNonDefaultLowerBounds = false);

 } // namespace omp
 } // namespace lower
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-array.f90 b/flang/test/Lower/OpenMP/delayed-privatization-array.f90
index 95fa3f9e03052..c447fa6f27a75 100644
--- a/flang/test/Lower/OpenMP/delayed-privatization-array.f90
+++ b/flang/test/Lower/OpenMP/delayed-privatization-array.f90
@@ -108,15 +108,14 @@ program main
 ! ONE_DIM_DEFAULT_LB-SAME: @[[PRIVATIZER_SYM:.*]] : [[BOX_TYPE:!fir.box>]] init {
 ! ONE_DIM_DEFAULT_LB-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref>>]], %[[PRIV_BOX_ALLOC:.*]]: [[TYPE]]):
-! ONE_DIM_DEFAULT_LB-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]]
 ! ONE_DIM_DEFAULT_LB-NEXT: %[[C10:.*]] = arith.constant 10 : index
 ! ONE_DIM_DEFAULT_LB-NEXT: %[[SHAPE:.*]] = fir.shape %[[C10]]
 ! ONE_DIM_DEFAULT_LB-NEXT: %[[ARRAY_ALLOC:.*]] = fir.allocmem !fir.array<10xi32>
 ! ONE_DIM_DEFAULT_LB-NEXT: %[[TRUE:.*]] = arith.constant true
 ! ONE_DIM_DEFAULT_LB-NEXT: %[[DECL:.*]]:2 = hlfir.declare %[[ARRAY_ALLOC]](%[[SHAPE]])
-! ONE_DIM_DEFAULT_LB-NEXT: %[[C0_0:.*]] = arith.constant 0
-! ONE_DIM_DEFAULT_LB-NEXT: %[[DIMS2:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0_0]]
-! ONE_DIM_DEFAULT_LB-NEXT: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[DIMS2]]#0, %[[DIMS2]]#1
+! ONE_DIM_DEFAULT_LB-NEXT: %[[ONE:.*]] = arith.constant 1 : index
+! ONE_DIM_DEFAULT_LB-NEXT: %[[TEN:.*]] = arith.constant 10 : index
+! ONE_DIM_DEFAULT_LB-NEXT: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[ONE]], %[[TEN]]
 ! ONE_DIM_DEFAULT_LB-NEXT: %[[EMBOX:.*]] = fir.embox %[[DECL]]#0(%[[SHAPE_SHIFT]])
 ! ONE_DIM_DEFAULT_LB-NEXT: fir.store %[[EMBOX]] to %[[PRIV_BOX_ALLOC]]
 ! ONE_DIM_DEFAULT_LB-NEXT: omp.yield(%[[PRIV_BOX_ALLOC]] : [[TYPE]])

From d4a0848dc6678bc4ce8c74855a818dfc8c30a088 Mon Sep 17 00:00:00 2001
From: James Chesterman
Date: Tue, 18 Feb 2025 09:08:47 +0000
Subject: [PATCH 095/127] [SelectionDAG] Add PARTIAL_REDUCE_U/SMLA ISD Nodes (#125207)

Add signed and unsigned PARTIAL_REDUCE_MLA ISD nodes. Add command line argument (aarch64-enable-partial-reduce-nodes) that indicates whether the intrinsic experimental_vector_partial_reduce_add will be transformed into the new ISD node. Lowering with the new ISD nodes will, for now, always be done as an expand.
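For context, a minimal IR sketch of the kind of partial reduction these nodes represent, modeled on the existing partial-reduction tests rather than taken from this patch: the 16 widened products are folded into the 4-element accumulator, so the result has fewer elements than the inputs.

```llvm
; Illustrative example only; the function name is made up, but the intrinsic
; and its signature are the ones exercised by the existing tests.
define <4 x i32> @udot_example(<4 x i32> %acc, <16 x i8> %a, <16 x i8> %b) {
  %a.wide = zext <16 x i8> %a to <16 x i32>
  %b.wide = zext <16 x i8> %b to <16 x i32>
  %mult = mul nuw nsw <16 x i32> %a.wide, %b.wide
  ; Adds the 16 products into the 4 accumulator lanes in an unspecified order.
  %r = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
  ret <4 x i32> %r
}

declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)
```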
--- llvm/include/llvm/CodeGen/ISDOpcodes.h | 17 + llvm/include/llvm/CodeGen/SelectionDAG.h | 5 - llvm/include/llvm/CodeGen/TargetLowering.h | 4 + .../SelectionDAG/LegalizeIntegerTypes.cpp | 30 + llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 4 + .../SelectionDAG/LegalizeVectorOps.cpp | 6 + .../SelectionDAG/LegalizeVectorTypes.cpp | 19 + .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 51 +- .../SelectionDAG/SelectionDAGBuilder.cpp | 10 +- .../SelectionDAG/SelectionDAGDumper.cpp | 5 + .../CodeGen/SelectionDAG/TargetLowering.cpp | 52 ++ llvm/lib/CodeGen/TargetLoweringBase.cpp | 4 + .../Target/AArch64/AArch64ISelLowering.cpp | 16 +- .../neon-partial-reduce-dot-product.ll | 1 + .../AArch64/sve-partial-reduce-dot-product.ll | 731 ++++++++++++++++++ .../AArch64/sve-partial-reduce-wide-add.ll | 49 ++ 16 files changed, 963 insertions(+), 41 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 68ed812222dfd..665c4d6baad80 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1459,6 +1459,23 @@ enum NodeType { VECREDUCE_UMAX, VECREDUCE_UMIN, + // PARTIAL_REDUCE_[U|S]MLA(Accumulator, Input1, Input2) + // The partial reduction nodes sign or zero extend Input1 and Input2 to the + // element type of Accumulator before multiplying their results. + // This result is concatenated to the Accumulator, and this is then reduced, + // using addition, to the result type. + // The output is only expected to either be given to another partial reduction + // operation or an equivalent vector reduce operation, so the order in which + // the elements are reduced is deliberately not specified. + // Input1 and Input2 must be the same type. Accumulator and the output must be + // the same type. + // The number of elements in Input1 and Input2 must be a positive integer + // multiple of the number of elements in the Accumulator / output type. + // Input1 and Input2 must have an element type which is the same as or smaller + // than the element type of the Accumulator and output. + PARTIAL_REDUCE_SMLA, + PARTIAL_REDUCE_UMLA, + // The `llvm.experimental.stackmap` intrinsic. // Operands: input chain, glue, , , [live0[, live1...]] // Outputs: output chain, glue diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 461c0c1ead16d..cf8e4a3d2513b 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1607,11 +1607,6 @@ class SelectionDAG { /// the target's desired shift amount type. SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op); - /// Create the DAG equivalent of vector_partial_reduce where Op1 and Op2 are - /// its operands and ReducedTY is the intrinsic's return type. - SDValue getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1, - SDValue Op2); - /// Expands a node with multiple results to an FP or vector libcall. The /// libcall is expected to take all the operands of the \p Node followed by /// output pointers for each of the results. \p CallRetResNo can be optionally diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index bbecc7a6ddaee..a4c3d042fe3a4 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5564,6 +5564,10 @@ class TargetLowering : public TargetLoweringBase { /// temporarily, advance store position, before re-loading the final vector. 
SDValue expandVECTOR_COMPRESS(SDNode *Node, SelectionDAG &DAG) const; + /// Expands PARTIAL_REDUCE_S/UMLA nodes to a series of simpler operations, + /// consisting of zext/sext, extract_subvector, mul and add operations. + SDValue expandPartialReduceMLA(SDNode *Node, SelectionDAG &DAG) const; + /// Legalize a SETCC or VP_SETCC with given LHS and RHS and condition code CC /// on the current target. A VP_SETCC will additionally be given a Mask /// and/or EVL not equal to SDValue(). diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index a0f29496df777..204b323d7084a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -159,6 +159,11 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { Res = PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(N); break; + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: + Res = PromoteIntRes_PARTIAL_REDUCE_MLA(N); + break; + case ISD::SIGN_EXTEND: case ISD::VP_SIGN_EXTEND: case ISD::ZERO_EXTEND: @@ -2099,6 +2104,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::VECTOR_FIND_LAST_ACTIVE: Res = PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(N, OpNo); break; + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: + Res = PromoteIntOp_PARTIAL_REDUCE_MLA(N); + break; } // If the result is null, the sub-method took care of registering results etc. @@ -2881,6 +2890,18 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_PARTIAL_REDUCE_MLA(SDNode *N) { + SmallVector NewOps(N->ops()); + if (N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA) { + NewOps[1] = SExtPromotedInteger(N->getOperand(1)); + NewOps[2] = SExtPromotedInteger(N->getOperand(2)); + } else { + NewOps[1] = ZExtPromotedInteger(N->getOperand(1)); + NewOps[2] = ZExtPromotedInteger(N->getOperand(2)); + } + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + //===----------------------------------------------------------------------===// // Integer Result Expansion //===----------------------------------------------------------------------===// @@ -6200,6 +6221,15 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N) { return DAG.getNode(ISD::VECTOR_FIND_LAST_ACTIVE, SDLoc(N), NVT, N->ops()); } +SDValue DAGTypeLegalizer::PromoteIntRes_PARTIAL_REDUCE_MLA(SDNode *N) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + SDValue ExtAcc = GetPromotedInteger(N->getOperand(0)); + return DAG.getNode(N->getOpcode(), DL, NVT, ExtAcc, N->getOperand(1), + N->getOperand(2)); +} + SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) { EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index b58c160b5c8b8..69c687a797485 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -379,6 +379,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntRes_IS_FPCLASS(SDNode *N); SDValue PromoteIntRes_PATCHPOINT(SDNode *N); SDValue PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N); + SDValue PromoteIntRes_PARTIAL_REDUCE_MLA(SDNode *N); // Integer Operand Promotion. 
bool PromoteIntegerOperand(SDNode *N, unsigned OpNo); @@ -430,6 +431,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_PARTIAL_REDUCE_MLA(SDNode *N); void SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -969,6 +971,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void SplitVecRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_FP_TO_XINT_SAT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo, SDValue &Hi); // Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>. bool SplitVectorOperand(SDNode *N, unsigned OpNo); @@ -1000,6 +1003,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SplitVecOp_FP_TO_XINT_SAT(SDNode *N); SDValue SplitVecOp_VP_CttzElements(SDNode *N); SDValue SplitVecOp_VECTOR_HISTOGRAM(SDNode *N); + SDValue SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N); //===--------------------------------------------------------------------===// // Vector Widening Support: LegalizeVectorTypes.cpp diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 7e8bae4b0f785..de4447fb0cf1a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -469,6 +469,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::VECTOR_COMPRESS: case ISD::SCMP: case ISD::UCMP: + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; case ISD::SMULFIX: @@ -1197,6 +1199,10 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { case ISD::VECREDUCE_FMINIMUM: Results.push_back(TLI.expandVecReduce(Node, DAG)); return; + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: + Results.push_back(TLI.expandPartialReduceMLA(Node, DAG)); + return; case ISD::VECREDUCE_SEQ_FADD: case ISD::VECREDUCE_SEQ_FMUL: Results.push_back(TLI.expandVecReduceSeq(Node, DAG)); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 1d8bf5427156e..9d42ec2fdf859 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1395,6 +1395,10 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::EXPERIMENTAL_VP_REVERSE: SplitVecRes_VP_REVERSE(N, Lo, Hi); break; + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: + SplitVecRes_PARTIAL_REDUCE_MLA(N, Lo, Hi); + break; } // If Lo/Hi is null, the sub-method took care of registering results etc. 
@@ -3213,6 +3217,13 @@ void DAGTypeLegalizer::SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo, std::tie(Lo, Hi) = DAG.SplitVector(Load, DL); } +void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDLoc DL(N); + SDValue Expanded = TLI.expandPartialReduceMLA(N, DAG); + std::tie(Lo, Hi) = DAG.SplitVector(Expanded, DL); +} + void DAGTypeLegalizer::SplitVecRes_VECTOR_DEINTERLEAVE(SDNode *N) { unsigned Factor = N->getNumOperands(); @@ -3431,6 +3442,10 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: Res = SplitVecOp_VECTOR_HISTOGRAM(N); break; + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: + Res = SplitVecOp_PARTIAL_REDUCE_MLA(N); + break; } // If the result is null, the sub-method took care of registering results etc. @@ -4485,6 +4500,10 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECTOR_HISTOGRAM(SDNode *N) { MMO, IndexType); } +SDValue DAGTypeLegalizer::SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N) { + return TLI.expandPartialReduceMLA(N, DAG); +} + //===----------------------------------------------------------------------===// // Result Vector Widening //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 9d2f87497d6fa..80c2de1d99542 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2474,35 +2474,6 @@ SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) { return getZExtOrTrunc(Op, SDLoc(Op), ShTy); } -SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1, - SDValue Op2) { - EVT FullTy = Op2.getValueType(); - - unsigned Stride = ReducedTy.getVectorMinNumElements(); - unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride; - - // Collect all of the subvectors - std::deque Subvectors = {Op1}; - for (unsigned I = 0; I < ScaleFactor; I++) { - auto SourceIndex = getVectorIdxConstant(I * Stride, DL); - Subvectors.push_back( - getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {Op2, SourceIndex})); - } - - // Flatten the subvector tree - while (Subvectors.size() > 1) { - Subvectors.push_back( - getNode(ISD::ADD, DL, ReducedTy, {Subvectors[0], Subvectors[1]})); - Subvectors.pop_front(); - Subvectors.pop_front(); - } - - assert(Subvectors.size() == 1 && - "There should only be one subvector after tree flattening"); - - return Subvectors[0]; -} - /// Given a store node \p StoreNode, return true if it is safe to fold that node /// into \p FPNode, which expands to a library call with output pointers. 
static bool canFoldStoreIntoLibCallOutputPointers(StoreSDNode *StoreNode, @@ -7883,6 +7854,28 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, break; } + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: { + [[maybe_unused]] EVT AccVT = N1.getValueType(); + [[maybe_unused]] EVT Input1VT = N2.getValueType(); + [[maybe_unused]] EVT Input2VT = N3.getValueType(); + assert(Input1VT.isVector() && Input1VT == Input2VT && + "Expected the second and third operands of the PARTIAL_REDUCE_MLA " + "node to have the same type!"); + assert(VT.isVector() && VT == AccVT && + "Expected the first operand of the PARTIAL_REDUCE_MLA node to have " + "the same type as its result!"); + assert(Input1VT.getVectorElementCount().hasKnownScalarFactor( + AccVT.getVectorElementCount()) && + "Expected the element count of the second and third operands of the " + "PARTIAL_REDUCE_MLA node to be a positive integer multiple of the " + "element count of the first operand and the result!"); + assert(N2.getScalarValueSizeInBits() <= N1.getScalarValueSizeInBits() && + "Expected the second and third operands of the PARTIAL_REDUCE_MLA " + "node to have an element type which is the same as or smaller than " + "the element type of the first operand and result!"); + break; + } } // Memoize node if it doesn't produce a glue result. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 78a6e24e5b8d2..1c58a7f05446c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8115,15 +8115,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; } case Intrinsic::experimental_vector_partial_reduce_add: { - if (!TLI.shouldExpandPartialReductionIntrinsic(cast(&I))) { visitTargetIntrinsic(I, Intrinsic); return; } - - setValue(&I, DAG.getPartialReduceAdd(sdl, EVT::getEVT(I.getType()), - getValue(I.getOperand(0)), - getValue(I.getOperand(1)))); + SDValue Acc = getValue(I.getOperand(0)); + SDValue Input = getValue(I.getOperand(1)); + setValue(&I, + DAG.getNode(ISD::PARTIAL_REDUCE_UMLA, sdl, Acc.getValueType(), Acc, + Input, DAG.getConstant(1, sdl, Input.getValueType()))); return; } case Intrinsic::experimental_cttz_elts: { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 8de537173e52c..8457bee3f665b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -569,6 +569,11 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::VECTOR_FIND_LAST_ACTIVE: return "find_last_active"; + case ISD::PARTIAL_REDUCE_UMLA: + return "partial_reduce_umla"; + case ISD::PARTIAL_REDUCE_SMLA: + return "partial_reduce_smla"; + // Vector Predication #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \ case ISD::SDID: \ diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index adfb96041c5c0..7771958f5adc9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -34,6 +34,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" #include +#include using namespace llvm; /// NOTE: The TargetMachine owns TLOF. 
@@ -11890,6 +11891,57 @@ SDValue TargetLowering::expandVECTOR_COMPRESS(SDNode *Node, return DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo); } +SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, + SelectionDAG &DAG) const { + SDLoc DL(N); + SDValue Acc = N->getOperand(0); + SDValue MulLHS = N->getOperand(1); + SDValue MulRHS = N->getOperand(2); + EVT AccVT = Acc.getValueType(); + EVT MulOpVT = MulLHS.getValueType(); + + EVT ExtMulOpVT = + EVT::getVectorVT(*DAG.getContext(), AccVT.getVectorElementType(), + MulOpVT.getVectorElementCount()); + unsigned ExtOpc = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA + ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND; + + if (ExtMulOpVT != MulOpVT) { + MulLHS = DAG.getNode(ExtOpc, DL, ExtMulOpVT, MulLHS); + MulRHS = DAG.getNode(ExtOpc, DL, ExtMulOpVT, MulRHS); + } + SDValue Input = MulLHS; + APInt ConstantOne; + if (!ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) || + !ConstantOne.isOne()) + Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); + + unsigned Stride = AccVT.getVectorMinNumElements(); + unsigned ScaleFactor = MulOpVT.getVectorMinNumElements() / Stride; + + // Collect all of the subvectors + std::deque Subvectors = {Acc}; + for (unsigned I = 0; I < ScaleFactor; I++) { + auto SourceIndex = DAG.getVectorIdxConstant(I * Stride, DL); + Subvectors.push_back( + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, AccVT, {Input, SourceIndex})); + } + + // Flatten the subvector tree + while (Subvectors.size() > 1) { + Subvectors.push_back( + DAG.getNode(ISD::ADD, DL, AccVT, {Subvectors[0], Subvectors[1]})); + Subvectors.pop_front(); + Subvectors.pop_front(); + } + + assert(Subvectors.size() == 1 && + "There should only be one subvector after tree flattening"); + + return Subvectors[0]; +} + bool TargetLowering::LegalizeSetCCCondCode(SelectionDAG &DAG, EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC, SDValue Mask, diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 51cde7ce139e2..f5ea3c0b47d6a 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -835,6 +835,10 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::GET_FPENV, VT, Expand); setOperationAction(ISD::SET_FPENV, VT, Expand); setOperationAction(ISD::RESET_FPENV, VT, Expand); + + // PartialReduceMLA operations default to expand. + setOperationAction({ISD::PARTIAL_REDUCE_UMLA, ISD::PARTIAL_REDUCE_SMLA}, VT, + Expand); } // Most targets ignore the @llvm.prefetch intrinsic. diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d47a0bfa4fc50..50be082777835 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -154,6 +154,13 @@ cl::opt EnableSVEGISel( cl::desc("Enable / disable SVE scalable vectors in Global ISel"), cl::init(false)); +// FIXME : This is a temporary flag, and is used to help transition to +// performing lowering the proper way using the new PARTIAL_REDUCE_MLA ISD +// nodes. +static cl::opt EnablePartialReduceNodes( + "aarch64-enable-partial-reduce-nodes", cl::init(false), cl::ReallyHidden, + cl::desc("Use the new method of lowering partial reductions.")); + /// Value type used for condition codes. 
static const MVT MVT_CC = MVT::i32; @@ -2050,6 +2057,8 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic( const IntrinsicInst *I) const { if (I->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add) return true; + if (EnablePartialReduceNodes) + return true; EVT VT = EVT::getEVT(I->getType()); auto Op1 = I->getOperand(1); @@ -21978,8 +21987,11 @@ static SDValue performIntrinsicCombine(SDNode *N, return Dot; if (SDValue WideAdd = tryLowerPartialReductionToWideAdd(N, Subtarget, DAG)) return WideAdd; - return DAG.getPartialReduceAdd(SDLoc(N), N->getValueType(0), - N->getOperand(1), N->getOperand(2)); + SDLoc DL(N); + SDValue Input = N->getOperand(2); + return DAG.getNode(ISD::PARTIAL_REDUCE_UMLA, DL, N->getValueType(0), + N->getOperand(1), Input, + DAG.getConstant(1, DL, Input.getValueType())); } case Intrinsic::aarch64_neon_vcvtfxs2fp: case Intrinsic::aarch64_neon_vcvtfxu2fp: diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll index 9ece9edb84343..40daf8ffb63ea 100644 --- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NOI8MM ; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT ; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-I8MM +; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { ; CHECK-DOT-LABEL: udot: diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll index 66f83c658ff4f..455231dd37be6 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll @@ -1,12 +1,36 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-I8MM ; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM +; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING define @udot( %acc, %a, %b) { ; CHECK-LABEL: udot: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: udot z0.s, z1.b, z2.b ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: udot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z1.b +; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: ptrue p0.s +; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h +; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s +; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s +; 
CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s +; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext %a to %b.wide = zext %b to @@ -20,6 +44,29 @@ define @udot_wide( %acc, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: udot z0.d, z1.h, z2.h ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: udot_wide: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext %a to %b.wide = zext %b to @@ -33,6 +80,29 @@ define @sdot( %accc, %a, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sdot z0.s, z1.b, z2.b ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sdot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z1.b +; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: ptrue p0.s +; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z3.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h +; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h +; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s +; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s +; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext %a to %b.wide = sext %b to @@ -46,6 +116,29 @@ define @sdot_wide( %acc, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sdot z0.d, z1.h, z2.h ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sdot_wide: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z2.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d +; 
CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext %a to %b.wide = sext %b to @@ -82,6 +175,29 @@ define @usdot( %acc, %a, ; CHECK-NOI8MM-NEXT: mla z1.s, p0/m, z7.s, z24.s ; CHECK-NOI8MM-NEXT: add z0.s, z1.s, z0.s ; CHECK-NOI8MM-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: usdot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z1.b +; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: ptrue p0.s +; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h +; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h +; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s +; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s +; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext %a to %b.wide = sext %b to @@ -118,6 +234,29 @@ define @sudot( %acc, %a, ; CHECK-NOI8MM-NEXT: mla z1.s, p0/m, z7.s, z24.s ; CHECK-NOI8MM-NEXT: add z0.s, z1.s, z0.s ; CHECK-NOI8MM-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sudot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z1.b +; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: ptrue p0.s +; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z3.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h +; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s +; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s +; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext %a to %b.wide = zext %b to @@ -136,6 +275,63 @@ define @udot_8to64( %acc, %a to %b.wide = zext %b to @@ -155,6 +351,63 @@ define @sdot_8to64( %acc, %a to %b.wide = sext %b to @@ -231,6 +484,63 @@ define @usdot_8to64( %acc, %a to %b.wide = sext %b to @@ -307,6 +617,63 @@ define @sudot_8to64( %acc, %a to %b.wide = zext %b to @@ -322,6 +689,20 @@ define @udot_no_bin_op( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %a.ext) ret %partial.reduce @@ -333,6 +714,20 @@ define @sdot_no_bin_op( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %a.ext) ret %partial.reduce @@ -344,6 +739,20 @@ define @udot_no_bin_op_wide( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %a.wide) @@ -356,6 +765,20 @@ 
define @sdot_no_bin_op_wide( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %a.wide) @@ -373,6 +796,32 @@ define @udot_no_bin_op_8to64( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %a.ext) ret %partial.reduce @@ -389,6 +838,32 @@ define @sdot_no_bin_op_8to64( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %a.ext) ret %partial.reduce @@ -407,6 +882,19 @@ define @not_udot( %acc, % ; CHECK-NEXT: mla z0.s, p0/m, z3.s, z4.s ; CHECK-NEXT: mla z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: not_udot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff +; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff +; CHECK-NEWLOWERING-NEXT: ptrue p0.s +; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z3.s, z4.s +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext %a to %b.wide = zext %b to @@ -428,6 +916,19 @@ define @not_udot_wide( %acc, %a to %b.wide = zext %b to @@ -459,6 +960,29 @@ define @not_usdot( %acc, ; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d ; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: not_usdot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z2.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext %a to %b.wide = sext %b to @@ -490,6 +1014,29 @@ define @not_sudot( %acc, ; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d ; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: not_sudot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; 
CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext %a to %b.wide = zext %b to @@ -522,6 +1069,30 @@ define @udot_different_types( %acc, %a to %b.wide = zext %b to @@ -555,6 +1126,31 @@ define @sdot_different_types( %acc, %a to %b.wide = sext %b to @@ -588,6 +1184,31 @@ define @usdot_different_types( %acc, %a to %b.wide = sext %b to @@ -620,6 +1241,30 @@ define @sudot_different_types( %acc, %a to %b.wide = zext %b to @@ -627,3 +1272,89 @@ entry: %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %mult) ret %partial.reduce } + +define @udot_nxv8i8_promote ( %acc, %a, %b){ +; CHECK-LABEL: udot_nxv8i8_promote: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and z1.h, z1.h, #0xff +; CHECK-NEXT: and z2.h, z2.h, #0xff +; CHECK-NEXT: mul z1.h, z1.h, z2.h +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEXT: uunpklo z3.d, z2.s +; CHECK-NEXT: uunpklo z4.d, z1.s +; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: add z0.d, z0.d, z3.d +; CHECK-NEXT: add z2.d, z2.d, z4.d +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: udot_nxv8i8_promote: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff +; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff +; CHECK-NEWLOWERING-NEXT: mul z1.h, z1.h, z2.h +; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z3.d +; CHECK-NEWLOWERING-NEXT: add z2.d, z2.d, z4.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret +entry: + %a.wide = zext %a to + %b.wide = zext %b to + %mult = mul nuw nsw %a.wide, %b.wide + %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i16.nxv8i16( %acc, %mult) + ret %partial.reduce +} + +define @sdot_nxv8i8_promote ( %acc, %a, %b){ +; CHECK-LABEL: sdot_nxv8i8_promote: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: sxtb z1.h, p0/m, z1.h +; CHECK-NEXT: sxtb z2.h, p0/m, z2.h +; CHECK-NEXT: mul z1.h, z1.h, z2.h +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEXT: uunpklo z3.d, z2.s +; CHECK-NEXT: uunpklo z4.d, z1.s +; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: add z0.d, z0.d, z3.d +; CHECK-NEXT: add z2.d, z2.d, z4.d +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sdot_nxv8i8_promote: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: ptrue p0.h +; CHECK-NEWLOWERING-NEXT: sxtb z1.h, p0/m, z1.h +; CHECK-NEWLOWERING-NEXT: sxtb z2.h, p0/m, z2.h +; CHECK-NEWLOWERING-NEXT: mul z1.h, z1.h, z2.h +; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z3.d +; CHECK-NEWLOWERING-NEXT: add z2.d, z2.d, z4.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; 
CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret +entry: + %a.wide = sext %a to + %b.wide = sext %b to + %mult = mul nuw nsw %a.wide, %b.wide + %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i16.nxv8i16( %acc, %mult) + ret %partial.reduce +} diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll index b4b946c68566e..11fb60ead4fb2 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE2 ; RUN: llc -mtriple=aarch64 -mattr=+sve %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE +; RUN: llc -mtriple=aarch64 -mattr=+sve2 -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEWLOWERING define @signed_wide_add_nxv4i32( %acc, %input){ ; CHECK-SVE2-LABEL: signed_wide_add_nxv4i32: @@ -16,6 +17,14 @@ define @signed_wide_add_nxv4i32( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64( %acc, %input.wide) @@ -36,6 +45,14 @@ define @unsigned_wide_add_nxv4i32( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64( %acc, %input.wide) @@ -56,6 +73,14 @@ define @signed_wide_add_nxv8i16( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32( %acc, %input.wide) @@ -76,6 +101,14 @@ define @unsigned_wide_add_nxv8i16( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32( %acc, %input.wide) @@ -96,6 +129,14 @@ define @signed_wide_add_nxv16i8( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16( %acc, %input.wide) @@ -116,6 +157,14 @@ define @unsigned_wide_add_nxv16i8( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16( %acc, %input.wide) From 22ef210100ca9ccfee6198a18fa0aae62950f481 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Tue, 18 Feb 2025 09:12:36 +0000 Subject: [PATCH 096/127] Revert "[flang][Lower][OpenMP] Don't read moldarg for static sized array" (#127596) Reverts llvm/llvm-project#125901 Revert until I have fixed bot failures --- .../lib/Lower/OpenMP/DataSharingProcessor.cpp | 5 +- .../Lower/OpenMP/PrivateReductionUtils.cpp | 61 ++++++------------- .../lib/Lower/OpenMP/PrivateReductionUtils.h | 6 +- .../OpenMP/delayed-privatization-array.f90 | 7 ++- 4 files changed, 28 insertions(+), 51 deletions(-) diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index d725dfd3e94f3..d13f101f516e7 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -508,8 +508,6 @@ void DataSharingProcessor::doPrivatize(const semantics::Symbol *sym, lower::SymbolBox hsb = converter.lookupOneLevelUpSymbol(*sym); assert(hsb && "Host symbol box not found"); - hlfir::Entity entity{hsb.getAddr()}; - bool cannotHaveNonDefaultLowerBounds = !entity.mayHaveNonDefaultLowerBounds(); mlir::Location symLoc = hsb.getAddr().getLoc(); std::string privatizerName = sym->name().ToString() + ".privatizer"; @@ -530,6 +528,7 @@ void 
DataSharingProcessor::doPrivatize(const semantics::Symbol *sym, // an alloca for a fir.array type there. Get around this by boxing all // arrays. if (mlir::isa(allocType)) { + hlfir::Entity entity{hsb.getAddr()}; entity = genVariableBox(symLoc, firOpBuilder, entity); privVal = entity.getBase(); allocType = privVal.getType(); @@ -591,7 +590,7 @@ void DataSharingProcessor::doPrivatize(const semantics::Symbol *sym, result.getDeallocRegion(), isFirstPrivate ? DeclOperationKind::FirstPrivate : DeclOperationKind::Private, - sym, cannotHaveNonDefaultLowerBounds); + sym); // TODO: currently there are false positives from dead uses of the mold // arg if (!result.getInitMoldArg().getUses().empty()) diff --git a/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp b/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp index 21ade77d82d37..22cd0679050db 100644 --- a/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp +++ b/flang/lib/Lower/OpenMP/PrivateReductionUtils.cpp @@ -122,40 +122,25 @@ static void createCleanupRegion(Fortran::lower::AbstractConverter &converter, typeError(); } -fir::ShapeShiftOp -Fortran::lower::omp::getShapeShift(fir::FirOpBuilder &builder, - mlir::Location loc, mlir::Value box, - bool cannotHaveNonDefaultLowerBounds) { +fir::ShapeShiftOp Fortran::lower::omp::getShapeShift(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value box) { fir::SequenceType sequenceType = mlir::cast( hlfir::getFortranElementOrSequenceType(box.getType())); const unsigned rank = sequenceType.getDimension(); - llvm::SmallVector lbAndExtents; lbAndExtents.reserve(rank * 2); - mlir::Type idxTy = builder.getIndexType(); - if (cannotHaveNonDefaultLowerBounds && !sequenceType.hasDynamicExtents()) { - // We don't need fir::BoxDimsOp if all of the extents are statically known - // and we can assume default lower bounds. This helps avoids reads from the - // mold arg. - mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); - for (int64_t extent : sequenceType.getShape()) { - assert(extent != sequenceType.getUnknownExtent()); - mlir::Value extentVal = builder.createIntegerConstant(loc, idxTy, extent); - lbAndExtents.push_back(one); - lbAndExtents.push_back(extentVal); - } - } else { - for (unsigned i = 0; i < rank; ++i) { - // TODO: ideally we want to hoist box reads out of the critical section. - // We could do this by having box dimensions in block arguments like - // OpenACC does - mlir::Value dim = builder.createIntegerConstant(loc, idxTy, i); - auto dimInfo = - builder.create(loc, idxTy, idxTy, idxTy, box, dim); - lbAndExtents.push_back(dimInfo.getLowerBound()); - lbAndExtents.push_back(dimInfo.getExtent()); - } + mlir::Type idxTy = builder.getIndexType(); + for (unsigned i = 0; i < rank; ++i) { + // TODO: ideally we want to hoist box reads out of the critical section. 
+ // We could do this by having box dimensions in block arguments like + // OpenACC does + mlir::Value dim = builder.createIntegerConstant(loc, idxTy, i); + auto dimInfo = + builder.create(loc, idxTy, idxTy, idxTy, box, dim); + lbAndExtents.push_back(dimInfo.getLowerBound()); + lbAndExtents.push_back(dimInfo.getExtent()); } auto shapeShiftTy = fir::ShapeShiftType::get(builder.getContext(), rank); @@ -263,13 +248,12 @@ class PopulateInitAndCleanupRegionsHelper { mlir::Type argType, mlir::Value scalarInitValue, mlir::Value allocatedPrivVarArg, mlir::Value moldArg, mlir::Block *initBlock, mlir::Region &cleanupRegion, - DeclOperationKind kind, const Fortran::semantics::Symbol *sym, - bool cannotHaveLowerBounds) + DeclOperationKind kind, const Fortran::semantics::Symbol *sym) : converter{converter}, builder{converter.getFirOpBuilder()}, loc{loc}, argType{argType}, scalarInitValue{scalarInitValue}, allocatedPrivVarArg{allocatedPrivVarArg}, moldArg{moldArg}, initBlock{initBlock}, cleanupRegion{cleanupRegion}, kind{kind}, - sym{sym}, cannotHaveNonDefaultLowerBounds{cannotHaveLowerBounds} { + sym{sym} { valType = fir::unwrapRefType(argType); } @@ -311,10 +295,6 @@ class PopulateInitAndCleanupRegionsHelper { /// Any length parameters which have been fetched for the type mlir::SmallVector lenParams; - /// If the source variable being privatized definitely can't have non-default - /// lower bounds then we don't need to generate code to read them. - bool cannotHaveNonDefaultLowerBounds; - void createYield(mlir::Value ret) { builder.create(loc, ret); } @@ -452,8 +432,7 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray( // Special case for (possibly allocatable) arrays of polymorphic types // e.g. !fir.class>>> if (source.isPolymorphic()) { - fir::ShapeShiftOp shape = - getShapeShift(builder, loc, source, cannotHaveNonDefaultLowerBounds); + fir::ShapeShiftOp shape = getShapeShift(builder, loc, source); mlir::Type arrayType = source.getElementOrSequenceType(); mlir::Value allocatedArray = builder.create( loc, arrayType, /*typeparams=*/mlir::ValueRange{}, shape.getExtents()); @@ -492,8 +471,8 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray( // Put the temporary inside of a box: // hlfir::genVariableBox doesn't handle non-default lower bounds mlir::Value box; - fir::ShapeShiftOp shapeShift = getShapeShift(builder, loc, getLoadedMoldArg(), - cannotHaveNonDefaultLowerBounds); + fir::ShapeShiftOp shapeShift = + getShapeShift(builder, loc, getLoadedMoldArg()); mlir::Type boxType = getLoadedMoldArg().getType(); if (mlir::isa(temp.getType())) // the box created by the declare form createTempFromMold is missing @@ -628,10 +607,10 @@ void Fortran::lower::omp::populateByRefInitAndCleanupRegions( mlir::Type argType, mlir::Value scalarInitValue, mlir::Block *initBlock, mlir::Value allocatedPrivVarArg, mlir::Value moldArg, mlir::Region &cleanupRegion, DeclOperationKind kind, - const Fortran::semantics::Symbol *sym, bool cannotHaveLowerBounds) { + const Fortran::semantics::Symbol *sym) { PopulateInitAndCleanupRegionsHelper helper( converter, loc, argType, scalarInitValue, allocatedPrivVarArg, moldArg, - initBlock, cleanupRegion, kind, sym, cannotHaveLowerBounds); + initBlock, cleanupRegion, kind, sym); helper.populateByRefInitAndCleanupRegions(); // Often we load moldArg to check something (e.g. 
length parameters, shape) diff --git a/flang/lib/Lower/OpenMP/PrivateReductionUtils.h b/flang/lib/Lower/OpenMP/PrivateReductionUtils.h index 0a3513bff19b0..fcd36392a29e0 100644 --- a/flang/lib/Lower/OpenMP/PrivateReductionUtils.h +++ b/flang/lib/Lower/OpenMP/PrivateReductionUtils.h @@ -55,13 +55,11 @@ void populateByRefInitAndCleanupRegions( mlir::Value scalarInitValue, mlir::Block *initBlock, mlir::Value allocatedPrivVarArg, mlir::Value moldArg, mlir::Region &cleanupRegion, DeclOperationKind kind, - const Fortran::semantics::Symbol *sym = nullptr, - bool cannotHaveNonDefaultLowerBounds = false); + const Fortran::semantics::Symbol *sym = nullptr); /// Generate a fir::ShapeShift op describing the provided boxed array. fir::ShapeShiftOp getShapeShift(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::Value box, - bool cannotHaveNonDefaultLowerBounds = false); + mlir::Value box); } // namespace omp } // namespace lower diff --git a/flang/test/Lower/OpenMP/delayed-privatization-array.f90 b/flang/test/Lower/OpenMP/delayed-privatization-array.f90 index c447fa6f27a75..95fa3f9e03052 100644 --- a/flang/test/Lower/OpenMP/delayed-privatization-array.f90 +++ b/flang/test/Lower/OpenMP/delayed-privatization-array.f90 @@ -108,14 +108,15 @@ program main ! ONE_DIM_DEFAULT_LB-SAME: @[[PRIVATIZER_SYM:.*]] : [[BOX_TYPE:!fir.box>]] init { ! ONE_DIM_DEFAULT_LB-NEXT: ^bb0(%[[PRIV_ARG:.*]]: [[TYPE:!fir.ref>>]], %[[PRIV_BOX_ALLOC:.*]]: [[TYPE]]): +! ONE_DIM_DEFAULT_LB-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] ! ONE_DIM_DEFAULT_LB-NEXT: %[[C10:.*]] = arith.constant 10 : index ! ONE_DIM_DEFAULT_LB-NEXT: %[[SHAPE:.*]] = fir.shape %[[C10]] ! ONE_DIM_DEFAULT_LB-NEXT: %[[ARRAY_ALLOC:.*]] = fir.allocmem !fir.array<10xi32> ! ONE_DIM_DEFAULT_LB-NEXT: %[[TRUE:.*]] = arith.constant true ! ONE_DIM_DEFAULT_LB-NEXT: %[[DECL:.*]]:2 = hlfir.declare %[[ARRAY_ALLOC]](%[[SHAPE]]) -! ONE_DIM_DEFAULT_LB-NEXT: %[[ONE:.*]] = arith.constant 1 : index -! ONE_DIM_DEFAULT_LB-NEXT: %[[TEN:.*]] = arith.constant 10 : index -! ONE_DIM_DEFAULT_LB-NEXT: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[ONE]], %[[TEN]] +! ONE_DIM_DEFAULT_LB-NEXT: %[[C0_0:.*]] = arith.constant 0 +! ONE_DIM_DEFAULT_LB-NEXT: %[[DIMS2:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0_0]] +! ONE_DIM_DEFAULT_LB-NEXT: %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[DIMS2]]#0, %[[DIMS2]]#1 ! ONE_DIM_DEFAULT_LB-NEXT: %[[EMBOX:.*]] = fir.embox %[[DECL]]#0(%[[SHAPE_SHIFT]]) ! ONE_DIM_DEFAULT_LB-NEXT: fir.store %[[EMBOX]] to %[[PRIV_BOX_ALLOC]] ! ONE_DIM_DEFAULT_LB-NEXT: omp.yield(%[[PRIV_BOX_ALLOC]] : [[TYPE]]) From 9fec0a0942f5a11f4dcfec20aa485a8513661720 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 18 Feb 2025 09:14:04 +0000 Subject: [PATCH 097/127] [libclc] Disable external-calls testing for clspv targets (#127529) These targets don't include all OpenCL builtins, so there will always be external calls in the final bytecode module. Fixes #127316. 
--- libclc/cmake/modules/AddLibclc.cmake | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake index 5347b0822477b..0bf6f98452ecd 100644 --- a/libclc/cmake/modules/AddLibclc.cmake +++ b/libclc/cmake/modules/AddLibclc.cmake @@ -351,8 +351,9 @@ function(add_libclc_builtin_set) add_custom_target( prepare-${obj_suffix} ALL DEPENDS ${obj_suffix} ) set_target_properties( "prepare-${obj_suffix}" PROPERTIES FOLDER "libclc/Device IR/Prepare" ) - # nvptx-- targets don't include workitem builtins - if( NOT ARG_TRIPLE MATCHES ".*ptx.*--$" ) + # nvptx-- targets don't include workitem builtins, and clspv targets don't + # include all OpenCL builtins + if( NOT ARG_ARCH MATCHES "^(nvptx|clspv)(64)?$" ) add_test( NAME external-calls-${obj_suffix} COMMAND ./check_external_calls.sh ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} ${LLVM_TOOLS_BINARY_DIR} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} ) From b8054104d36d0b26b2a445d61ba12cf0fe6ba805 Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Tue, 18 Feb 2025 17:14:29 +0800 Subject: [PATCH 098/127] LLVM/Test: Mark Mips readcyclecounter.ll XFAIL: expensive_checks (#127587) The expensive_checks build complains that: bb.0.entry: %0:gpr32 = RDHWR $hwr2, 0 %1:gpr32 = ADDiu $zero, 0 $v0 = COPY %0:gpr32 $v1 = COPY %1:gpr32 RetRA implicit $v0, implicit $v1 *** Bad machine code: Using an undefined physical register *** - function: test_readcyclecounter - basic block: %bb.0 entry (0xad97ee0) - instruction: %0:gpr32 = RDHWR $hwr2, 0 - operand 1: $hwr2 LLVM ERROR: Found 1 machine code errors. --- llvm/test/CodeGen/Mips/readcyclecounter.ll | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/test/CodeGen/Mips/readcyclecounter.ll b/llvm/test/CodeGen/Mips/readcyclecounter.ll index 467dd92884b3d..23d3ea014f091 100644 --- a/llvm/test/CodeGen/Mips/readcyclecounter.ll +++ b/llvm/test/CodeGen/Mips/readcyclecounter.ll @@ -7,6 +7,8 @@ ;RUN: llc -mtriple=mipsel -mcpu=mips2 < %s | FileCheck %s --check-prefix=MIPSEL_NOT_SUPPORTED ;RUN: llc -mtriple=mips64el -mcpu=mips3 < %s | FileCheck %s --check-prefix=MIPS64EL_NOT_SUPPORTED +; XFAIL: expensive_checks + declare i64 @llvm.readcyclecounter() nounwind readnone define i64 @test_readcyclecounter() nounwind { From 61ab476460516f4bd60a03a46902e801d0db7306 Mon Sep 17 00:00:00 2001 From: Sven van Haastregt Date: Tue, 18 Feb 2025 10:24:14 +0100 Subject: [PATCH 099/127] [SPIR-V] Fix out-of-range value for NumWorkgroups builtin (#127198) The OpenCL C specification states that for out-of-range dimension indices, `get_num_groups` must return 1 instead of 0.
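For illustration only (not part of the patch), a minimal C sketch of the clamping behaviour the builtin must provide for a three-dimensional NDRange; the helper name and the groups[] array are hypothetical stand-ins for the NumWorkgroups builtin variable:

/* Hypothetical model of the required semantics: valid dimension indices are
 * 0..2; any other index must yield 1 rather than 0. */
static unsigned long num_groups_model(const unsigned long groups[3],
                                      unsigned dim) {
  return dim < 3u ? groups[dim] : 1u;
}
/* e.g. num_groups_model(groups, 5) == 1 */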
1 : 0); diff --git a/llvm/test/CodeGen/SPIRV/opencl/get_num_groups.ll b/llvm/test/CodeGen/SPIRV/opencl/get_num_groups.ll new file mode 100644 index 0000000000000..3f1d1dc248fc4 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/opencl/get_num_groups.ll @@ -0,0 +1,55 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s + +;; The set of valid inputs for get_num_groups depends on the runtime NDRange, +;; but inputs outside of [0, 2] always return 1. +;; Here we assume Itanium mangling for function name. +declare i64 @_Z14get_num_groupsj(i32) + +define i64 @foo(i32 %dim) { + %x = call i64 @_Z14get_num_groupsj(i32 0) + %y = call i64 @_Z14get_num_groupsj(i32 5) + %acc = add i64 %x, %y + %unknown = call i64 @_Z14get_num_groupsj(i32 %dim) + %ret = add i64 %acc, %unknown + ret i64 %ret +} + +;; Capabilities: +; CHECK-DAG: OpCapability Kernel +; CHECK-DAG: OpCapability Int64 + +; CHECK-NOT: DAG-FENCE + +;; Decorations: +; CHECK-DAG: OpDecorate %[[#GET_NUM_GROUPS:]] BuiltIn NumWorkgroups +; CHECK-DAG: OpDecorate %[[#GET_NUM_GROUPS]] Constant + +; CHECK-NOT: DAG-FENCE + +;; Types, Constants and Variables: +; CHECK-DAG: %[[#BOOL:]] = OpTypeBool +; CHECK-DAG: %[[#I32:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#I64:]] = OpTypeInt 64 0 +; CHECK-DAG: %[[#VEC:]] = OpTypeVector %[[#I64]] 3 +; CHECK-DAG: %[[#PTR:]] = OpTypePointer Input %[[#VEC]] +; CHECK-DAG: %[[#FN:]] = OpTypeFunction %[[#I64]] %[[#I32]] +; CHECK-DAG: %[[#GET_NUM_GROUPS]] = OpVariable %[[#PTR]] Input +; CHECK-DAG: %[[#ONE:]] = OpConstant %[[#I64]] 1 +; CHECK-DAG: %[[#THREE:]] = OpConstant %[[#I32]] 3 + +;; Functions: +; CHECK: OpFunction %[[#I64]] None %[[#FN]] +; CHECK: %[[#DIM:]] = OpFunctionParameter %[[#I32]] + +;; get_num_groups(0): OpLoad + OpCompositeExtract. +; CHECK: %[[#TMP1:]] = OpLoad %[[#VEC]] %[[#GET_NUM_GROUPS]] +; CHECK: %[[#X:]] = OpCompositeExtract %[[#I64]] %[[#TMP1]] 0 + +;; get_num_groups(5): OpConstant of one. +; CHECK: OpIAdd %[[#I64]] %[[#X]] %[[#ONE]] + +;; get_num_groups(dim): Implementation using OpSelect. +; CHECK-DAG: %[[#TMP2:]] = OpLoad %[[#VEC]] %[[#GET_NUM_GROUPS]] +; CHECK-DAG: %[[#TMP3:]] = OpVectorExtractDynamic %[[#I64]] %[[#TMP2]] %[[#DIM]] +; CHECK-DAG: %[[#COND:]] = OpULessThan %[[#BOOL]] %[[#DIM]] %[[#THREE]] +; CHECK: %[[#UNKNOWN:]] = OpSelect %[[#I64]] %[[#COND]] %[[#TMP3]] %[[#ONE]] From 2fdb26da619cd09e3ccc8d154e48eb0034474823 Mon Sep 17 00:00:00 2001 From: Ming-Yi Lai Date: Tue, 18 Feb 2025 17:27:20 +0800 Subject: [PATCH 100/127] [clang][RISCV] Introduce preprocessor macro when Zicfiss-based shadow stack is enabled (#127592) The `-fcf-protection=[full|return]` flag enables shadow stack implementation based on RISC-V Zicfiss extension. This patch adds the `__riscv_shadow_stack` predefined macro to preprocessing when such a shadow stack implementation is enabled. 
--- clang/include/clang/Basic/LangOptions.def | 1 + clang/lib/Basic/Targets/RISCV.cpp | 3 ++ clang/lib/Frontend/CompilerInvocation.cpp | 7 ++- .../Preprocessor/riscv-cf-protection-return.c | 44 +++++++++++++++++++ 4 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 clang/test/Preprocessor/riscv-cf-protection-return.c diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index bfab0baa089cf..383440ddbc0ea 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -365,6 +365,7 @@ LANGOPT(ObjCDisableDirectMethodsForTesting, 1, 0, LANGOPT(CFProtectionBranch , 1, 0, "Control-Flow Branch Protection enabled") ENUM_LANGOPT(CFBranchLabelScheme, CFBranchLabelSchemeKind, 2, CFBranchLabelSchemeKind::Default, "Control-Flow Branch Protection Label Scheme") +LANGOPT(CFProtectionReturn, 1, 0, "Control-Flow Return Protection enabled") LANGOPT(FakeAddressSpaceMap , 1, 0, "OpenCL fake address space map") ENUM_LANGOPT(AddressSpaceMapMangling , AddrSpaceMapMangling, 2, ASMM_Target, "OpenCL address space map mangling mode") LANGOPT(IncludeDefaultHeader, 1, 0, "Include default header file for OpenCL") diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp index b4aa3206fcfab..dff990d15dd62 100644 --- a/clang/lib/Basic/Targets/RISCV.cpp +++ b/clang/lib/Basic/Targets/RISCV.cpp @@ -238,6 +238,9 @@ void RISCVTargetInfo::getTargetDefines(const LangOptions &Opts, else Builder.defineMacro("__riscv_32e"); } + + if (Opts.CFProtectionReturn && ISAInfo->hasExtension("zicfiss")) + Builder.defineMacro("__riscv_shadow_stack"); } static constexpr int NumRVVBuiltins = diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 014e629c959e2..b9a5c0589ebc4 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4048,8 +4048,13 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, if (const Arg *A = Args.getLastArg(OPT_fcf_protection_EQ)) { StringRef Name = A->getValue(); - if (Name == "full" || Name == "branch") { + if (Name == "full") { + Opts.CFProtectionBranch = 1; + Opts.CFProtectionReturn = 1; + } else if (Name == "branch") { Opts.CFProtectionBranch = 1; + } else if (Name == "return") { + Opts.CFProtectionReturn = 1; } } diff --git a/clang/test/Preprocessor/riscv-cf-protection-return.c b/clang/test/Preprocessor/riscv-cf-protection-return.c new file mode 100644 index 0000000000000..3a93a88fa6839 --- /dev/null +++ b/clang/test/Preprocessor/riscv-cf-protection-return.c @@ -0,0 +1,44 @@ +// RUN: %clang --target=riscv32 -E -dM %s -o - | \ +// RUN: FileCheck --check-prefixes=NO-MACRO %s + +// RUN: %clang --target=riscv32 -fcf-protection=return -E -dM %s -o - | \ +// RUN: FileCheck --check-prefixes=NO-MACRO %s + +// RUN: %clang --target=riscv32 -fcf-protection=full -E -dM %s -o - | \ +// RUN: FileCheck --check-prefixes=NO-MACRO %s + +// RUN: %clang --target=riscv32 -march=rv32i_zicfiss1p0 \ +// RUN: -menable-experimental-extensions -E -dM %s -o - | \ +// RUN: FileCheck --check-prefixes=NO-MACRO %s + +// RUN: %clang --target=riscv32 -march=rv32i_zicfiss1p0 \ +// RUN: -menable-experimental-extensions -fcf-protection=return -E -dM %s \ +// RUN: -o - | FileCheck --check-prefixes=SHSTK-MACRO %s + +// RUN: %clang --target=riscv32 -march=rv32i_zicfiss1p0 \ +// RUN: -menable-experimental-extensions -fcf-protection=full -E -dM %s -o - \ +// RUN: | FileCheck --check-prefixes=SHSTK-MACRO %s 
+ +// RUN: %clang --target=riscv64 -E -dM %s -o - | \ +// RUN: FileCheck --check-prefixes=NO-MACRO %s + +// RUN: %clang --target=riscv64 -fcf-protection=return -E -dM %s -o - | \ +// RUN: FileCheck --check-prefixes=NO-MACRO %s + +// RUN: %clang --target=riscv64 -fcf-protection=full -E -dM %s -o - | \ +// RUN: FileCheck --check-prefixes=NO-MACRO %s + +// RUN: %clang --target=riscv64 -march=rv64i_zicfiss1p0 \ +// RUN: -menable-experimental-extensions -E -dM %s -o - | \ +// RUN: FileCheck --check-prefixes=NO-MACRO %s + +// RUN: %clang --target=riscv64 -march=rv64i_zicfiss1p0 \ +// RUN: -menable-experimental-extensions -fcf-protection=return -E -dM %s \ +// RUN: -o - | FileCheck --check-prefixes=SHSTK-MACRO %s + +// RUN: %clang --target=riscv64 -march=rv64i_zicfiss1p0 \ +// RUN: -menable-experimental-extensions -fcf-protection=full -E -dM %s -o - \ +// RUN: | FileCheck --check-prefixes=SHSTK-MACRO %s + +// SHSTK-MACRO: __riscv_shadow_stack 1{{$}} +// NO-MACRO-NOT: __riscv_shadow_stack From 2b71df5a74cb5bd67f3f34277749dc920fd35105 Mon Sep 17 00:00:00 2001 From: Adam Siemieniuk Date: Tue, 18 Feb 2025 10:50:11 +0100 Subject: [PATCH 101/127] [mlir][x86vector] AVX512-BF16 Convert packed F32 to BF16 (#125685) Adds AVX512 bf16 conversion from packed f32 to bf16 elements. Tests are slightly refactored to better follow file's convention. --- .../mlir/Dialect/X86Vector/X86Vector.td | 40 ++++++++++++++++++ .../Transforms/LegalizeForLLVMExport.cpp | 42 ++++++++++++++++++- .../X86Vector/cvt-packed-f32-to-bf16.mlir | 24 +++++++++++ .../Dialect/X86Vector/legalize-for-llvm.mlir | 18 ++++++++ mlir/test/Dialect/X86Vector/roundtrip.mlir | 20 +++++++++ mlir/test/Target/LLVMIR/x86vector.mlir | 38 +++++++++++++---- 6 files changed, 171 insertions(+), 11 deletions(-) create mode 100644 mlir/test/Dialect/X86Vector/cvt-packed-f32-to-bf16.mlir diff --git a/mlir/include/mlir/Dialect/X86Vector/X86Vector.td b/mlir/include/mlir/Dialect/X86Vector/X86Vector.td index 16181d7e760db..566013e73f4b8 100644 --- a/mlir/include/mlir/Dialect/X86Vector/X86Vector.td +++ b/mlir/include/mlir/Dialect/X86Vector/X86Vector.td @@ -341,6 +341,46 @@ def DotBF16Ps512IntrOp : AVX512_IntrOp<"dpbf16ps.512", 1, [Pure, let results = (outs VectorOfLengthAndType<[16], [F32]>:$res); } +//----------------------------------------------------------------------------// +// Convert packed F32 to packed BF16 +//----------------------------------------------------------------------------// + +def CvtPackedF32ToBF16Op : AVX512_Op<"cvt.packed.f32_to_bf16", [Pure, + AllElementCountsMatch<["a", "dst"]>]> { + let summary = "Convert packed F32 to packed BF16 Data."; + let description = [{ + The `convert_f32_to_bf16` op is an AVX512-BF16 specific op that can lower + to the proper LLVMAVX512BF16 operation `llvm.cvtneps2bf16` depending on + the width of MLIR vectors it is applied to. + + #### From the Intel Intrinsics Guide: + + Convert packed single-precision (32-bit) floating-point elements in `a` to + packed BF16 (16-bit) floating-point elements, and store the results in `dst`. 
+ + Example: + ```mlir + %dst = x86vector.avx512.cvt.packed.f32_to_bf16 %a : vector<8xf32> -> vector<8xbf16> + ``` + }]; + let arguments = (ins VectorOfLengthAndType<[8, 16], [F32]>:$a); + let results = (outs VectorOfLengthAndType<[8, 16], [BF16]>:$dst); + let assemblyFormat = + "$a attr-dict `:` type($a) `->` type($dst)"; +} + +def CvtNeF32ToBF16Ps256IntrOp : AVX512_IntrOp<"cvtneps2bf16.256", 1, [Pure], + /*extension=*/"bf16"> { + let arguments = (ins VectorOfLengthAndType<[8], [F32]>:$a); + let results = (outs VectorOfLengthAndType<[8], [BF16]>:$res); +} + +def CvtNeF32ToBF16Ps512IntrOp : AVX512_IntrOp<"cvtneps2bf16.512", 1, [Pure], + /*extension=*/"bf16"> { + let arguments = (ins VectorOfLengthAndType<[16], [F32]>:$a); + let results = (outs VectorOfLengthAndType<[16], [BF16]>:$res); +} + //===----------------------------------------------------------------------===// // AVX op definitions //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp index 260ac9ce589a3..f1fbb39b97fc4 100644 --- a/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp +++ b/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp @@ -131,6 +131,39 @@ struct DotBF16OpConversion : public ConvertOpToLLVMPattern { } }; +struct CvtPackedF32ToBF16Conversion + : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(CvtPackedF32ToBF16Op op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto typeA = dyn_cast(op.getA().getType()); + unsigned elemBitWidth = typeA.getElementTypeBitWidth(); + unsigned opBitWidth = typeA.getShape()[0] * elemBitWidth; + + auto opType = op.getDst().getType(); + auto opA = op.getA(); + + switch (opBitWidth) { + case 256: { + rewriter.replaceOpWithNewOp(op, opType, opA); + break; + } + case 512: { + rewriter.replaceOpWithNewOp(op, opType, opA); + break; + } + default: { + return rewriter.notifyMatchFailure( + op, "unsupported AVX512-BF16 packed f32 to bf16 variant"); + } + } + + return success(); + } +}; + struct RsqrtOpConversion : public ConvertOpToLLVMPattern { using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; @@ -202,8 +235,10 @@ using Registry = RegistryImpl< void mlir::populateX86VectorLegalizeForLLVMExportPatterns( const LLVMTypeConverter &converter, RewritePatternSet &patterns) { Registry::registerPatterns(converter, patterns); - patterns.add(converter); + patterns + .add( + converter); } void mlir::configureX86VectorLegalizeForExportTarget( @@ -215,6 +250,9 @@ void mlir::configureX86VectorLegalizeForExportTarget( target.addLegalOp(); target.addLegalOp(); target.addIllegalOp(); + target.addLegalOp(); + target.addLegalOp(); + target.addIllegalOp(); target.addLegalOp(); target.addIllegalOp(); target.addLegalOp(); diff --git a/mlir/test/Dialect/X86Vector/cvt-packed-f32-to-bf16.mlir b/mlir/test/Dialect/X86Vector/cvt-packed-f32-to-bf16.mlir new file mode 100644 index 0000000000000..c97c52f01c3b0 --- /dev/null +++ b/mlir/test/Dialect/X86Vector/cvt-packed-f32-to-bf16.mlir @@ -0,0 +1,24 @@ +// REQUIRES: target=x86{{.*}} + +// RUN: mlir-opt %s \ +// RUN: -convert-vector-to-llvm="enable-x86vector" -convert-to-llvm \ +// RUN: -reconcile-unrealized-casts | \ +// RUN: mlir-translate --mlir-to-llvmir | \ +// RUN: llc -mcpu=sapphirerapids | \ +// RUN: FileCheck %s + +func.func @avx512bf16_cvt_packed_f32_to_bf16_256( + 
%a: vector<8xf32>) -> vector<8xbf16> { + %0 = x86vector.avx512.cvt.packed.f32_to_bf16 %a : vector<8xf32> -> vector<8xbf16> + return %0 : vector<8xbf16> +} +// CHECK-LABEL: avx512bf16_cvt_packed_f32_to_bf16_256: +// CHECK: vcvtneps2bf16{{.*}}%xmm + +func.func @avx512bf16_cvt_packed_f32_to_bf16_512( + %a: vector<16xf32>) -> vector<16xbf16> { + %0 = x86vector.avx512.cvt.packed.f32_to_bf16 %a : vector<16xf32> -> vector<16xbf16> + return %0 : vector<16xbf16> +} +// CHECK-LABEL: avx512bf16_cvt_packed_f32_to_bf16_512: +// CHECK: vcvtneps2bf16{{.*}}%ymm diff --git a/mlir/test/Dialect/X86Vector/legalize-for-llvm.mlir b/mlir/test/Dialect/X86Vector/legalize-for-llvm.mlir index ed9177eaec9ce..59be7dd75b3b0 100644 --- a/mlir/test/Dialect/X86Vector/legalize-for-llvm.mlir +++ b/mlir/test/Dialect/X86Vector/legalize-for-llvm.mlir @@ -70,6 +70,24 @@ func.func @avx512bf16_dot_512(%src: vector<16xf32>, %a: vector<32xbf16>, return %0 : vector<16xf32> } +// CHECK-LABEL: func @avx512bf16_cvt_packed_f32_to_bf16_256 +func.func @avx512bf16_cvt_packed_f32_to_bf16_256( + %a: vector<8xf32>) -> (vector<8xbf16>) +{ + // CHECK: x86vector.avx512.intr.cvtneps2bf16.256 + %0 = x86vector.avx512.cvt.packed.f32_to_bf16 %a : vector<8xf32> -> vector<8xbf16> + return %0 : vector<8xbf16> +} + +// CHECK-LABEL: func @avx512bf16_cvt_packed_f32_to_bf16_512 +func.func @avx512bf16_cvt_packed_f32_to_bf16_512( + %a: vector<16xf32>) -> (vector<16xbf16>) +{ + // CHECK: x86vector.avx512.intr.cvtneps2bf16.512 + %0 = x86vector.avx512.cvt.packed.f32_to_bf16 %a : vector<16xf32> -> vector<16xbf16> + return %0 : vector<16xbf16> +} + // CHECK-LABEL: func @avx_rsqrt func.func @avx_rsqrt(%a: vector<8xf32>) -> (vector<8xf32>) { diff --git a/mlir/test/Dialect/X86Vector/roundtrip.mlir b/mlir/test/Dialect/X86Vector/roundtrip.mlir index cf74a7ee60255..0d00448c63da8 100644 --- a/mlir/test/Dialect/X86Vector/roundtrip.mlir +++ b/mlir/test/Dialect/X86Vector/roundtrip.mlir @@ -74,6 +74,26 @@ func.func @avx512bf16_dot_512(%src: vector<16xf32>, %a: vector<32xbf16>, return %0 : vector<16xf32> } +// CHECK-LABEL: func @avx512bf16_cvt_packed_f32_to_bf16_256 +func.func @avx512bf16_cvt_packed_f32_to_bf16_256( + %a: vector<8xf32>) -> (vector<8xbf16>) +{ + // CHECK: x86vector.avx512.cvt.packed.f32_to_bf16 {{.*}} : + // CHECK-SAME: vector<8xf32> -> vector<8xbf16> + %0 = x86vector.avx512.cvt.packed.f32_to_bf16 %a : vector<8xf32> -> vector<8xbf16> + return %0 : vector<8xbf16> +} + +// CHECK-LABEL: func @avx512bf16_cvt_packed_f32_to_bf16_512 +func.func @avx512bf16_cvt_packed_f32_to_bf16_512( + %a: vector<16xf32>) -> (vector<16xbf16>) +{ + // CHECK: x86vector.avx512.cvt.packed.f32_to_bf16 {{.*}} : + // CHECK-SAME: vector<16xf32> -> vector<16xbf16> + %0 = x86vector.avx512.cvt.packed.f32_to_bf16 %a : vector<16xf32> -> vector<16xbf16> + return %0 : vector<16xbf16> +} + // CHECK-LABEL: func @avx_rsqrt func.func @avx_rsqrt(%a: vector<8xf32>) -> (vector<8xf32>) { diff --git a/mlir/test/Target/LLVMIR/x86vector.mlir b/mlir/test/Target/LLVMIR/x86vector.mlir index 1df03f10c9321..db1c10cd5cd37 100644 --- a/mlir/test/Target/LLVMIR/x86vector.mlir +++ b/mlir/test/Target/LLVMIR/x86vector.mlir @@ -62,37 +62,57 @@ llvm.func @LLVM_x86_vp2intersect_q_512(%a: vector<8xi64>, %b: vector<8xi64>) // CHECK-LABEL: define <4 x float> @LLVM_x86_avx512bf16_dpbf16ps_128 llvm.func @LLVM_x86_avx512bf16_dpbf16ps_128( - %arg0: vector<4xf32>, %arg1: vector<8xbf16>, %arg2: vector<8xbf16> + %src: vector<4xf32>, %a: vector<8xbf16>, %b: vector<8xbf16> ) -> vector<4xf32> { // CHECK: call <4 x float> 
@llvm.x86.avx512bf16.dpbf16ps.128( - %0 = "x86vector.avx512.intr.dpbf16ps.128"(%arg0, %arg1, %arg2) + %0 = "x86vector.avx512.intr.dpbf16ps.128"(%src, %a, %b) : (vector<4xf32>, vector<8xbf16>, vector<8xbf16>) -> vector<4xf32> llvm.return %0 : vector<4xf32> } // CHECK-LABEL: define <8 x float> @LLVM_x86_avx512bf16_dpbf16ps_256 llvm.func @LLVM_x86_avx512bf16_dpbf16ps_256( - %arg0: vector<8xf32>, %arg1: vector<16xbf16>, %arg2: vector<16xbf16> + %src: vector<8xf32>, %a: vector<16xbf16>, %b: vector<16xbf16> ) -> vector<8xf32> { // CHECK: call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256( - %0 = "x86vector.avx512.intr.dpbf16ps.256"(%arg0, %arg1, %arg2) + %0 = "x86vector.avx512.intr.dpbf16ps.256"(%src, %a, %b) : (vector<8xf32>, vector<16xbf16>, vector<16xbf16>) -> vector<8xf32> llvm.return %0 : vector<8xf32> } // CHECK-LABEL: define <16 x float> @LLVM_x86_avx512bf16_dpbf16ps_512 llvm.func @LLVM_x86_avx512bf16_dpbf16ps_512( - %arg0: vector<16xf32>, %arg1: vector<32xbf16>, %arg2: vector<32xbf16> + %src: vector<16xf32>, %a: vector<32xbf16>, %b: vector<32xbf16> ) -> vector<16xf32> { // CHECK: call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512( - %0 = "x86vector.avx512.intr.dpbf16ps.512"(%arg0, %arg1, %arg2) + %0 = "x86vector.avx512.intr.dpbf16ps.512"(%src, %a, %b) : (vector<16xf32>, vector<32xbf16>, vector<32xbf16>) -> vector<16xf32> llvm.return %0 : vector<16xf32> } +// CHECK-LABEL: define <8 x bfloat> @LLVM_x86_avx512bf16_cvtneps2bf16_256 +llvm.func @LLVM_x86_avx512bf16_cvtneps2bf16_256( + %a: vector<8xf32>) -> vector<8xbf16> +{ + // CHECK: call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256( + %0 = "x86vector.avx512.intr.cvtneps2bf16.256"(%a) + : (vector<8xf32>) -> vector<8xbf16> + llvm.return %0 : vector<8xbf16> +} + +// CHECK-LABEL: define <16 x bfloat> @LLVM_x86_avx512bf16_cvtneps2bf16_512 +llvm.func @LLVM_x86_avx512bf16_cvtneps2bf16_512( + %a: vector<16xf32>) -> vector<16xbf16> +{ + // CHECK: call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512( + %0 = "x86vector.avx512.intr.cvtneps2bf16.512"(%a) + : (vector<16xf32>) -> vector<16xbf16> + llvm.return %0 : vector<16xbf16> +} + // CHECK-LABEL: define <8 x float> @LLVM_x86_avx_rsqrt_ps_256 llvm.func @LLVM_x86_avx_rsqrt_ps_256(%a: vector <8xf32>) -> vector<8xf32> { @@ -103,11 +123,11 @@ llvm.func @LLVM_x86_avx_rsqrt_ps_256(%a: vector <8xf32>) -> vector<8xf32> // CHECK-LABEL: define <8 x float> @LLVM_x86_avx_dp_ps_256 llvm.func @LLVM_x86_avx_dp_ps_256( - %arg0: vector<8xf32>, %arg1: vector<8xf32> + %a: vector<8xf32>, %b: vector<8xf32> ) -> vector<8xf32> { // CHECK: call <8 x float> @llvm.x86.avx.dp.ps.256( - %0 = llvm.mlir.constant(-1 : i8) : i8 - %1 = "x86vector.avx.intr.dp.ps.256"(%arg0, %arg1, %0) : (vector<8xf32>, vector<8xf32>, i8) -> vector<8xf32> + %c = llvm.mlir.constant(-1 : i8) : i8 + %1 = "x86vector.avx.intr.dp.ps.256"(%a, %b, %c) : (vector<8xf32>, vector<8xf32>, i8) -> vector<8xf32> llvm.return %1 : vector<8xf32> } From bc4f05d8a8a4f908252aba9444571de4398d4288 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Tue, 18 Feb 2025 02:08:28 -0800 Subject: [PATCH 102/127] [AMDGPU] Early bail in getFunctionCodeSize for meta inst. NFC. (#127129) It does not change the estimate because getInstSizeInBytes() already returns 0 for meta instructions, but added a test and early bail. 
--- llvm/lib/Target/AMDGPU/SIProgramInfo.cpp | 3 +-- llvm/test/CodeGen/AMDGPU/code-size-estimate.mir | 13 +++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index 5179288084010..1123696509818 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -215,8 +215,7 @@ uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { for (const MachineInstr &MI : MBB) { // TODO: CodeSize should account for multiple functions. - // TODO: Should we count size of debug info? - if (MI.isDebugInstr()) + if (MI.isMetaInstruction()) continue; CodeSize += TII->getInstSizeInBytes(MI); diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir b/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir index 9e46c58b6b5a9..76eaf350301e4 100644 --- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir +++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir @@ -18,3 +18,16 @@ body: | $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 V_MOV_B32_indirect_write undef $vgpr0, undef $vgpr3, implicit $exec, implicit $m0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3(tied-def 4) ... + +# CHECK: meta: ; @meta +# CHECK: ; wave barrier +# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +# CHECK: ; codeLenInByte = 4 +--- +name: meta +tracksRegLiveness: true +body: | + bb.0: + + WAVE_BARRIER +... From 7c03865a1ec6de2b734d8cbf75ca2e79ac6d013d Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 18 Feb 2025 17:16:50 +0700 Subject: [PATCH 103/127] AMDGPU: Extract lambda used in foldImmediate into a helper function (#127484) It was also too permissive for a more general utilty, only return the original immediate if there is no subregister. 
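For illustration only (not part of the patch): a minimal, self-contained sketch of the value mapping the new helper performs. The enum and the casts below are simplified stand-ins for the real AMDGPU subregister indices and for the Lo_32 / Hi_32 / SignExtend64 helpers, and the sub1_lo16 / sub1_hi16 cases are omitted for brevity.

  #include <cstdint>
  #include <optional>

  // Hypothetical, simplified subregister indices (illustration only).
  enum class SubReg { None, Sub0, Sub1, Lo16, Hi16 };

  // Value observed through a subregister use of a materialized 64-bit immediate.
  std::optional<int64_t> extractSubregFromImmSketch(int64_t Imm, SubReg Idx) {
    switch (Idx) {
    case SubReg::None: return Imm;                           // whole 64-bit value
    case SubReg::Sub0: return (uint32_t)Imm;                 // bits [31:0]
    case SubReg::Sub1: return (uint32_t)(Imm >> 32);         // bits [63:32]
    case SubReg::Lo16: return (int64_t)(int16_t)Imm;         // bits [15:0], sign-extended
    case SubReg::Hi16: return (int64_t)(int16_t)(Imm >> 16); // bits [31:16], sign-extended
    }
    return std::nullopt;                                     // unhandled index
  }

The real helper additionally handles sub1_lo16 and sub1_hi16 and returns std::nullopt for any other subregister index, as shown in the diff below.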
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 59 ++++++++++++++++---------- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 9 ++++ 2 files changed, 45 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 4ee5ebd7681b8..07addb38b8711 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3437,6 +3437,30 @@ void SIInstrInfo::removeModOperands(MachineInstr &MI) const { } } +std::optional SIInstrInfo::extractSubregFromImm(int64_t Imm, + unsigned SubRegIndex) { + switch (SubRegIndex) { + case AMDGPU::NoSubRegister: + return Imm; + case AMDGPU::sub0: + return Lo_32(Imm); + case AMDGPU::sub1: + return Hi_32(Imm); + case AMDGPU::lo16: + return SignExtend64<16>(Imm); + case AMDGPU::hi16: + return SignExtend64<16>(Imm >> 16); + case AMDGPU::sub1_lo16: + return SignExtend64<16>(Imm >> 32); + case AMDGPU::sub1_hi16: + return SignExtend64<16>(Imm >> 48); + default: + return std::nullopt; + } + + llvm_unreachable("covered subregister switch"); +} + bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const { if (!MRI->hasOneNonDBGUse(Reg)) @@ -3446,25 +3470,6 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (!getConstValDefinedInReg(DefMI, Reg, Imm)) return false; - auto getImmFor = [=](const MachineOperand &UseOp) -> int64_t { - switch (UseOp.getSubReg()) { - default: - return Imm; - case AMDGPU::sub0: - return Lo_32(Imm); - case AMDGPU::sub1: - return Hi_32(Imm); - case AMDGPU::lo16: - return SignExtend64<16>(Imm); - case AMDGPU::hi16: - return SignExtend64<16>(Imm >> 16); - case AMDGPU::sub1_lo16: - return SignExtend64<16>(Imm >> 32); - case AMDGPU::sub1_hi16: - return SignExtend64<16>(Imm >> 48); - } - }; - assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form"); unsigned Opc = UseMI.getOpcode(); @@ -3480,7 +3485,11 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, : AMDGPU::V_MOV_B32_e32 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::S_MOV_B32; - APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)), + + std::optional SubRegImm = + extractSubregFromImm(Imm, UseMI.getOperand(1).getSubReg()); + + APInt Imm(Is64Bit ? 64 : 32, *SubRegImm, /*isSigned=*/true, /*implicitTrunc=*/true); if (RI.isAGPR(*MRI, DstReg)) { @@ -3591,7 +3600,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (NewOpc == AMDGPU::V_FMAMK_F16_fake16) return false; - const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1); + const std::optional SubRegImm = extractSubregFromImm( + Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg()); // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. @@ -3608,7 +3618,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); - Src1->ChangeToImmediate(Imm); + Src1->ChangeToImmediate(*SubRegImm); removeModOperands(UseMI); UseMI.setDesc(get(NewOpc)); @@ -3679,8 +3689,11 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); + const std::optional SubRegImm = + extractSubregFromImm(Imm, Src2->getSubReg()); + // ChangingToImmediate adds Src2 back to the instruction. 
- Src2->ChangeToImmediate(getImmFor(*Src2)); + Src2->ChangeToImmediate(*SubRegImm); // These come before src2. removeModOperands(UseMI); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index ddd15e1766f70..79ecc2a657ed0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -401,6 +401,15 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { void removeModOperands(MachineInstr &MI) const; + /// Return the extracted immediate value in a subregister use from a constant + /// materialized in a super register. + /// + /// e.g. %imm = S_MOV_B64 K[0:63] + /// USE %imm.sub1 + /// This will return K[32:63] + static std::optional extractSubregFromImm(int64_t ImmVal, + unsigned SubRegIndex); + bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final; From 1c8add1ec70d8d730572029ac11a70f4dfac8ed5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Don=C3=A1t=20Nagy?= Date: Tue, 18 Feb 2025 11:19:43 +0100 Subject: [PATCH 104/127] [analyzer] Add hack in ArrayBound to cover up missing casts (#127117) Currently there are many casts that are not modeled (i.e. ignored) by the analyzer, which can cause paradox states (e.g. negative value stored in `unsigned` variable) and false positive reports from various checkers, e.g. from `security.ArrayBound`. Unfortunately this issue is deeply rooted in the architectural limitations of the analyzer (if we started to model the casts, it would break other things). For details see the umbrella ticket https://github.com/llvm/llvm-project/issues/39492 This commit adds an ugly hack in `security.ArrayBound` to silence most of the false positives caused by this shortcoming of the engine. Fixes #126884 --- .../Checkers/ArrayBoundChecker.cpp | 99 ++++++++++++++----- clang/test/Analysis/out-of-bounds.c | 22 +++-- 2 files changed, 87 insertions(+), 34 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp index f56e9192d1d66..954b4763034e7 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp @@ -34,24 +34,37 @@ using namespace taint; using llvm::formatv; namespace { -/// If `E` is a "clean" array subscript expression, return the type of the -/// accessed element. If the base of the subscript expression is modified by -/// pointer arithmetic (and not the beginning of a "full" memory region), this -/// always returns nullopt because that's the right (or the least bad) thing to -/// do for the diagnostic output that's relying on this. -static std::optional determineElementType(const Expr *E, - const CheckerContext &C) { +/// If `E` is an array subscript expression with a base that is "clean" (= not +/// modified by pointer arithmetic = the beginning of a memory region), return +/// it as a pointer to ArraySubscriptExpr; otherwise return nullptr. +/// This helper function is used by two separate heuristics that are only valid +/// in these "clean" cases. 
+static const ArraySubscriptExpr * +getAsCleanArraySubscriptExpr(const Expr *E, const CheckerContext &C) { const auto *ASE = dyn_cast(E); if (!ASE) - return std::nullopt; + return nullptr; const MemRegion *SubscriptBaseReg = C.getSVal(ASE->getBase()).getAsRegion(); if (!SubscriptBaseReg) - return std::nullopt; + return nullptr; // The base of the subscript expression is affected by pointer arithmetics, - // so we want to report byte offsets instead of indices. + // so we want to report byte offsets instead of indices and we don't want to + // activate the "index is unsigned -> cannot be negative" shortcut. if (isa(SubscriptBaseReg->StripCasts())) + return nullptr; + + return ASE; +} + +/// If `E` is a "clean" array subscript expression, return the type of the +/// accessed element; otherwise return std::nullopt because that's the best (or +/// least bad) option for the diagnostic generation that relies on this. +static std::optional determineElementType(const Expr *E, + const CheckerContext &C) { + const auto *ASE = getAsCleanArraySubscriptExpr(E, C); + if (!ASE) return std::nullopt; return ASE->getType(); @@ -140,7 +153,9 @@ class ArrayBoundChecker : public Checker, ProgramStateRef ErrorState, NonLoc Val, bool MarkTaint); - static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC); + static bool isFromCtypeMacro(const Expr *E, ASTContext &AC); + + static bool isOffsetObviouslyNonnegative(const Expr *E, CheckerContext &C); static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State, NonLoc Offset, NonLoc Limit, @@ -587,20 +602,48 @@ void ArrayBoundChecker::performCheck(const Expr *E, CheckerContext &C) const { State, ByteOffset, SVB.makeZeroArrayIndex(), SVB); if (PrecedesLowerBound) { - // The offset may be invalid (negative)... - if (!WithinLowerBound) { - // ...and it cannot be valid (>= 0), so report an error. - Messages Msgs = getPrecedesMsgs(Reg, ByteOffset); - reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt); - return; + // The analyzer thinks that the offset may be invalid (negative)... + + if (isOffsetObviouslyNonnegative(E, C)) { + // ...but the offset is obviously non-negative (clear array subscript + // with an unsigned index), so we're in a buggy situation. + + // TODO: Currently the analyzer ignores many casts (e.g. signed -> + // unsigned casts), so it can easily reach states where it will load a + // signed (and negative) value from an unsigned variable. This sanity + // check is a duct tape "solution" that silences most of the ugly false + // positives that are caused by this buggy behavior. Note that this is + // not a complete solution: this cannot silence reports where pointer + // arithmetic complicates the picture and cannot ensure modeling of the + // "unsigned index is positive with highest bit set" cases which are + // "usurped" by the nonsense "unsigned index is negative" case. + // For more information about this topic, see the umbrella ticket + // https://github.com/llvm/llvm-project/issues/39492 + // TODO: Remove this hack once 'SymbolCast's are modeled properly. + + if (!WithinLowerBound) { + // The state is completely nonsense -- let's just sink it! + C.addSink(); + return; + } + // Otherwise continue on the 'WithinLowerBound' branch where the + // unsigned index _is_ non-negative. Don't mention this assumption as a + // note tag, because it would just confuse the users! + } else { + if (!WithinLowerBound) { + // ...and it cannot be valid (>= 0), so report an error. 
+ Messages Msgs = getPrecedesMsgs(Reg, ByteOffset); + reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt); + return; + } + // ...but it can be valid as well, so the checker will (optimistically) + // assume that it's valid and mention this in the note tag. + SUR.recordNonNegativeAssumption(); } - // ...but it can be valid as well, so the checker will (optimistically) - // assume that it's valid and mention this in the note tag. - SUR.recordNonNegativeAssumption(); } // Actually update the state. The "if" only fails in the extremely unlikely - // case when compareValueToThreshold returns {nullptr, nullptr} becasue + // case when compareValueToThreshold returns {nullptr, nullptr} because // evalBinOpNN fails to evaluate the less-than operator. if (WithinLowerBound) State = WithinLowerBound; @@ -660,7 +703,7 @@ void ArrayBoundChecker::performCheck(const Expr *E, CheckerContext &C) const { } // Actually update the state. The "if" only fails in the extremely unlikely - // case when compareValueToThreshold returns {nullptr, nullptr} becasue + // case when compareValueToThreshold returns {nullptr, nullptr} because // evalBinOpNN fails to evaluate the less-than operator. if (WithinUpperBound) State = WithinUpperBound; @@ -725,8 +768,8 @@ void ArrayBoundChecker::reportOOB(CheckerContext &C, ProgramStateRef ErrorState, C.emitReport(std::move(BR)); } -bool ArrayBoundChecker::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) { - SourceLocation Loc = S->getBeginLoc(); +bool ArrayBoundChecker::isFromCtypeMacro(const Expr *E, ASTContext &ACtx) { + SourceLocation Loc = E->getBeginLoc(); if (!Loc.isMacroID()) return false; @@ -744,6 +787,14 @@ bool ArrayBoundChecker::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) { (MacroName == "isupper") || (MacroName == "isxdigit")); } +bool ArrayBoundChecker::isOffsetObviouslyNonnegative(const Expr *E, + CheckerContext &C) { + const ArraySubscriptExpr *ASE = getAsCleanArraySubscriptExpr(E, C); + if (!ASE) + return false; + return ASE->getIdx()->getType()->isUnsignedIntegerOrEnumerationType(); +} + bool ArrayBoundChecker::isInAddressOf(const Stmt *S, ASTContext &ACtx) { ParentMapContext &ParentCtx = ACtx.getParentMapContext(); do { diff --git a/clang/test/Analysis/out-of-bounds.c b/clang/test/Analysis/out-of-bounds.c index 7a094b8fdc840..7d6cb4ecf1b24 100644 --- a/clang/test/Analysis/out-of-bounds.c +++ b/clang/test/Analysis/out-of-bounds.c @@ -188,29 +188,31 @@ int test_cast_to_unsigned(signed char x) { if (x >= 0) return x; // FIXME: Here the analyzer ignores the signed -> unsigned cast, and manages to - // load a negative value from an unsigned variable. This causes an underflow - // report, which is an ugly false positive. + // load a negative value from an unsigned variable. // The underlying issue is tracked by Github ticket #39492. clang_analyzer_value(y); // expected-warning {{8s:{ [-128, -1] } }} - return table[y]; // expected-warning {{Out of bound access to memory preceding}} + // However, a hack in the ArrayBound checker suppresses the false positive + // underflow report that would be generated here. + return table[y]; // no-warning } int test_cast_to_unsigned_overflow(signed char x) { unsigned char y = x; if (x >= 0) return x; - // A variant of 'test_cast_to_unsigned' where the correct behavior would be - // an overflow report (because the negative values are cast to `unsigned - // char` values that are too large). - // FIXME: See comment in 'test_cast_to_unsigned'. 
+ // FIXME: As in 'test_cast_to_unsigned', the analyzer thinks that this + // unsigned variable contains a negative value. clang_analyzer_value(y); // expected-warning {{8s:{ [-128, -1] } }} - return small_table[y]; // expected-warning {{Out of bound access to memory preceding}} + // FIXME: The following subscript expression should produce an overflow + // report (because negative signed char corresponds to unsigned char >= 128); + // but the hack in ArrayBound just silences reports and cannot "restore" the + // real execution paths. + return small_table[y]; // no-warning } int test_negative_offset_with_unsigned_idx(void) { // An example where the subscript operator uses an unsigned index, but the - // underflow report is still justified. (We should try to keep this if we - // silence false positives like the one in 'test_cast_to_unsigned'.) + // underflow report is still justified. int *p = table - 10; unsigned idx = 2u; return p[idx]; // expected-warning {{Out of bound access to memory preceding}} From cd10c01767f1d06748c1d4502cc4384a18c06115 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 18 Feb 2025 17:19:53 +0700 Subject: [PATCH 105/127] AMDGPU: Handle subregister uses in SIFoldOperands constant folding (#127485) --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 57 +++++++++++-------- .../AMDGPU/constant-fold-imm-immreg.mir | 34 +++++++++++ 2 files changed, 67 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 7c08a21dea3b8..ab396929162d0 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -123,7 +123,7 @@ class SIFoldOperandsImpl { SmallVectorImpl &FoldList, SmallVectorImpl &CopiesToReplace) const; - MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const; + std::optional getImmOrMaterializedImm(MachineOperand &Op) const; bool tryConstantFoldOp(MachineInstr *MI) const; bool tryFoldCndMask(MachineInstr &MI) const; bool tryFoldZeroHighBits(MachineInstr &MI) const; @@ -1296,21 +1296,22 @@ static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) { MI.removeOperand(I); } -MachineOperand * +std::optional SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const { - // If this has a subregister, it obviously is a register source. 
- if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister || - !Op.getReg().isVirtual()) - return &Op; + if (Op.isImm()) + return Op.getImm(); - MachineInstr *Def = MRI->getVRegDef(Op.getReg()); + if (!Op.isReg() || !Op.getReg().isVirtual()) + return std::nullopt; + + const MachineInstr *Def = MRI->getVRegDef(Op.getReg()); if (Def && Def->isMoveImmediate()) { - MachineOperand &ImmSrc = Def->getOperand(1); + const MachineOperand &ImmSrc = Def->getOperand(1); if (ImmSrc.isImm()) - return &ImmSrc; + return TII->extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg()); } - return &Op; + return std::nullopt; } // Try to simplify operations with a constant that may appear after instruction @@ -1325,12 +1326,14 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); if (Src0Idx == -1) return false; - MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx)); + + MachineOperand *Src0 = &MI->getOperand(Src0Idx); + std::optional Src0Imm = getImmOrMaterializedImm(*Src0); if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 || Opc == AMDGPU::S_NOT_B32) && - Src0->isImm()) { - MI->getOperand(1).ChangeToImmediate(~Src0->getImm()); + Src0Imm) { + MI->getOperand(1).ChangeToImmediate(~*Src0Imm); mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); return true; } @@ -1338,17 +1341,19 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); if (Src1Idx == -1) return false; - MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx)); - if (!Src0->isImm() && !Src1->isImm()) + MachineOperand *Src1 = &MI->getOperand(Src1Idx); + std::optional Src1Imm = getImmOrMaterializedImm(*Src1); + + if (!Src0Imm && !Src1Imm) return false; // and k0, k1 -> v_mov_b32 (k0 & k1) // or k0, k1 -> v_mov_b32 (k0 | k1) // xor k0, k1 -> v_mov_b32 (k0 ^ k1) - if (Src0->isImm() && Src1->isImm()) { + if (Src0Imm && Src1Imm) { int32_t NewImm; - if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm())) + if (!evalBinaryInstruction(Opc, NewImm, *Src0Imm, *Src1Imm)) return false; bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg()); @@ -1364,12 +1369,13 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { if (!MI->isCommutable()) return false; - if (Src0->isImm() && !Src1->isImm()) { + if (Src0Imm && !Src1Imm) { std::swap(Src0, Src1); std::swap(Src0Idx, Src1Idx); + std::swap(Src0Imm, Src1Imm); } - int32_t Src1Val = static_cast(Src1->getImm()); + int32_t Src1Val = static_cast(*Src1Imm); if (Opc == AMDGPU::V_OR_B32_e64 || Opc == AMDGPU::V_OR_B32_e32 || Opc == AMDGPU::S_OR_B32) { @@ -1426,9 +1432,12 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const { MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); if (!Src1->isIdenticalTo(*Src0)) { - auto *Src0Imm = getImmOrMaterializedImm(*Src0); - auto *Src1Imm = getImmOrMaterializedImm(*Src1); - if (!Src1Imm->isIdenticalTo(*Src0Imm)) + std::optional Src1Imm = getImmOrMaterializedImm(*Src1); + if (!Src1Imm) + return false; + + std::optional Src0Imm = getImmOrMaterializedImm(*Src0); + if (!Src0Imm || *Src0Imm != *Src1Imm) return false; } @@ -1461,8 +1470,8 @@ bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const { MI.getOpcode() != AMDGPU::V_AND_B32_e32) return false; - MachineOperand *Src0 = 
getImmOrMaterializedImm(MI.getOperand(1)); - if (!Src0->isImm() || Src0->getImm() != 0xffff) + std::optional Src0Imm = getImmOrMaterializedImm(MI.getOperand(1)); + if (!Src0Imm || *Src0Imm != 0xffff) return false; Register Src1 = MI.getOperand(2).getReg(); diff --git a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir index 39b5076ebe5ac..807eaf2160b3c 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -927,3 +927,37 @@ body: | S_ENDPGM 0, implicit %3 ... + +--- +name: constant_s_xor_b32_uses_subreg +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: constant_s_xor_b32_uses_subreg + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 47 + ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; GCN-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]] + %0:sreg_64 = S_MOV_B64 32 + %1:sreg_64 = S_MOV_B64 15 + %2:sgpr_32 = S_XOR_B32 %0.sub0, %1.sub0, implicit-def dead $scc + %3:sgpr_32 = S_XOR_B32 %0.sub1, %1.sub1, implicit-def dead $scc + S_ENDPGM 0, implicit %2, implicit %3 + +... + +--- +name: constant_v_or_b32_uses_subreg +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: constant_v_or_b32_uses_subreg + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 268435455, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]] + %0:vreg_64 = V_MOV_B64_PSEUDO 18446744069683019775, implicit $exec + %1:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + %2:vgpr_32 = V_OR_B32_e32 %0.sub0, %1.sub0, implicit $exec + %3:vgpr_32 = V_OR_B32_e32 %0.sub1, %1.sub1, implicit $exec + S_ENDPGM 0, implicit %2, implicit %3 + +... From ef218317d7e30eb32a8e4e1a68d3fe95ca3b8402 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Tue, 18 Feb 2025 11:24:15 +0100 Subject: [PATCH 106/127] [bolt][bazel] Port https://github.com/llvm/llvm-project/commit/e235fcb582eec5f58c905b66f96d0732d17b875e. --- utils/bazel/configure.bzl | 13 +++++++++++-- .../bazel/llvm-project-overlay/bolt/BUILD.bazel | 17 +++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/utils/bazel/configure.bzl b/utils/bazel/configure.bzl index c5da28845eccf..fcc9fc7ecc483 100644 --- a/utils/bazel/configure.bzl +++ b/utils/bazel/configure.bzl @@ -172,10 +172,19 @@ def _llvm_configure_impl(repository_ctx): ) # Create a starlark file with the requested LLVM targets. - targets = repository_ctx.attr.targets + llvm_targets = repository_ctx.attr.targets repository_ctx.file( "llvm/targets.bzl", - content = "llvm_targets = " + str(targets), + content = "llvm_targets = " + str(llvm_targets), + executable = False, + ) + + # Create a starlark file with the requested BOLT targets. + bolt_targets = ["AArch64","X86","RISCV"] # Supported targets. 
+ bolt_targets = [t for t in llvm_targets if t in bolt_targets] + repository_ctx.file( + "bolt/targets.bzl", + content = "bolt_targets = " + str(bolt_targets), executable = False, ) diff --git a/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel b/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel index 187938783a550..a9a7cc59575a3 100644 --- a/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception load("@bazel_skylib//rules:expand_template.bzl", "expand_template") +load(":targets.bzl", "bolt_targets") package( default_visibility = ["//visibility:public"], @@ -16,6 +17,20 @@ genrule( cmd = "echo '#undef BOLT_REVISION' >> $@\n", ) +expand_template( + name = "target_config_def_gen", + out = "include/bolt/Core/TargetConfig.def", + substitutions = {"@BOLT_ENUM_TARGETS@": "\n".join( + ["BOLT_TARGET({})".format(target) for target in bolt_targets], + )}, + template = "include/bolt/Core/TargetConfig.def.in", +) + +cc_library( + name = "TargetConfig", + textual_hdrs = [":target_config_def_gen"], +) + cc_binary( name = "llvm-bolt-heatmap", srcs = glob([ @@ -24,6 +39,7 @@ cc_binary( deps = [ ":Profile", ":Rewrite", + ":TargetConfig", ":Utils", "//llvm:AllTargetsAsmParsers", "//llvm:AllTargetsDisassemblers", @@ -54,6 +70,7 @@ cc_binary( ":Profile", ":Rewrite", ":RuntimeLibs", + ":TargetConfig", ":TargetAArch64", ":TargetX86", ":Utils", From 6646b65082e8bc5f485c744a9c15344011c6aede Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Tue, 18 Feb 2025 10:25:47 +0000 Subject: [PATCH 107/127] [LAA] Rework and rename stripGetElementPtr (#125315) The stripGetElementPtr function is mysteriously named, and calls into another mysterious getGEPInductionOperand which does something complicated with GEP indices. The real purpose of the badly-named stripGetElementPtr function is to get a loop-variant GEP index, if there is one. The getGEPInductionOperand is totally redundant, as stripping off zeros from the end of GEP indices has no effect on computing the loop-variant GEP index, as constant zeros are always loop-invariant. Moreover, the GEP induction operand is simply the first non-zero index from the end, which stripGetElementPtr returns when it finds that any of the GEP indices are loop-variant: this is a completely unrelated value to the GEP index that is loop-variant. The implicit assumption here is that there is only ever one loop-variant index, and it is the first non-zero one from the end. The logic is unnecessarily complicated for what stripGetElementPtr wants to achieve, and the header comments are confusing as well. Strip getGEPInductionOperand, rework and rename stripGetElementPtr. 
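To make the intent concrete (illustration based on the new test added below): in single_stride_array, the access %gep.A = getelementptr inbounds [2 x i32], ptr %A, i64 %mul, i64 1 has exactly one loop-variant operand, %mul (the induction variable scaled by %stride), while the trailing constant index 1 is loop-invariant. The reworked getLoopVariantGEPOperand therefore returns %mul, and the symbolic-stride logic analyses that operand directly, with no need to peel trailing indices first.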
--- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 61 ++++++------------- .../LoopAccessAnalysis/symbolic-stride.ll | 47 ++++++++++++++ 2 files changed, 64 insertions(+), 44 deletions(-) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 90db89f745e89..5a22ac8abc3fc 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -42,13 +42,12 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" -#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" @@ -66,7 +65,6 @@ #include using namespace llvm; -using namespace llvm::PatternMatch; #define DEBUG_TYPE "loop-accesses" @@ -2815,50 +2813,25 @@ bool LoopAccessInfo::isInvariant(Value *V) const { return SE->isLoopInvariant(S, TheLoop); } -/// Find the operand of the GEP that should be checked for consecutive -/// stores. This ignores trailing indices that have no effect on the final -/// pointer. -static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) { - const DataLayout &DL = Gep->getDataLayout(); - unsigned LastOperand = Gep->getNumOperands() - 1; - TypeSize GEPAllocSize = DL.getTypeAllocSize(Gep->getResultElementType()); - - // Walk backwards and try to peel off zeros. - while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) { - // Find the type we're currently indexing into. - gep_type_iterator GEPTI = gep_type_begin(Gep); - std::advance(GEPTI, LastOperand - 2); - - // If it's a type with the same allocation size as the result of the GEP we - // can peel off the zero index. - TypeSize ElemSize = GEPTI.isStruct() - ? DL.getTypeAllocSize(GEPTI.getIndexedType()) - : GEPTI.getSequentialElementStride(DL); - if (ElemSize != GEPAllocSize) - break; - --LastOperand; - } - - return LastOperand; -} - -/// If the argument is a GEP, then returns the operand identified by -/// getGEPInductionOperand. However, if there is some other non-loop-invariant -/// operand, it returns that instead. -static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { +/// If \p Ptr is a GEP, which has a loop-variant operand, return that operand. +/// Otherwise, return \p Ptr. +static Value *getLoopVariantGEPOperand(Value *Ptr, ScalarEvolution *SE, + Loop *Lp) { auto *GEP = dyn_cast(Ptr); if (!GEP) return Ptr; - unsigned InductionOperand = getGEPInductionOperand(GEP); - - // Check that all of the gep indices are uniform except for our induction - // operand. - for (unsigned I = 0, E = GEP->getNumOperands(); I != E; ++I) - if (I != InductionOperand && - !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(I)), Lp)) - return Ptr; - return GEP->getOperand(InductionOperand); + Value *V = Ptr; + for (const Use &U : GEP->operands()) { + if (!SE->isLoopInvariant(SE->getSCEV(U), Lp)) { + if (V == Ptr) + V = U; + else + // There must be exactly one loop-variant operand. + return Ptr; + } + } + return V; } /// Get the stride of a pointer access in a loop. Looks for symbolic @@ -2873,7 +2846,7 @@ static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *L // pointer, otherwise, we are analyzing the index. 
Value *OrigPtr = Ptr; - Ptr = stripGetElementPtr(Ptr, SE, Lp); + Ptr = getLoopVariantGEPOperand(Ptr, SE, Lp); const SCEV *V = SE->getSCEV(Ptr); if (Ptr != OrigPtr) diff --git a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll index 525995156481c..8603417081067 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll @@ -140,6 +140,53 @@ exit: ret void } +; Test with multiple GEP indices +define void @single_stride_array(ptr noalias %A, ptr noalias %B, i64 %N, i64 %stride) { +; CHECK-LABEL: 'single_stride_array' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Backward loop carried data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Backward: +; CHECK-NEXT: %load = load [2 x i32], ptr %gep.A, align 4 -> +; CHECK-NEXT: store [2 x i32] %ins, ptr %gep.A.next, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: Equal predicate: %stride == 1 +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: [PSE] %gep.A = getelementptr inbounds [2 x i32], ptr %A, i64 %mul, i64 1: +; CHECK-NEXT: {(4 + %A),+,(8 * %stride)}<%loop> +; CHECK-NEXT: --> {(4 + %A),+,8}<%loop> +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %mul = mul i64 %iv, %stride + %gep.A = getelementptr inbounds [2 x i32], ptr %A, i64 %mul, i64 1 + %load = load [2 x i32], ptr %gep.A, align 4 + %gep.B = getelementptr inbounds [2 x i32], ptr %B, i64 %iv + %load_1 = load [2 x i32], ptr %gep.B, align 4 + %v1 = extractvalue [2 x i32] %load, 0 + %v2 = extractvalue [2 x i32] %load_1, 0 + %add = add i32 %v1, %v2 + %ins = insertvalue [2 x i32] poison, i32 %add, 0 + %iv.next = add nuw nsw i64 %iv, 1 + %gep.A.next = getelementptr inbounds [2 x i32], ptr %A, i64 %iv.next + store [2 x i32] %ins, ptr %gep.A.next, align 4 + %exitcond = icmp eq i64 %iv.next, %N + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + define void @single_stride_castexpr(i32 %offset, ptr %src, ptr %dst, i1 %cond) { ; CHECK-LABEL: 'single_stride_castexpr' ; CHECK-NEXT: inner.loop: From c71f9141a970b6f6d46d27d7c26c7747dd525275 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 18 Feb 2025 10:48:50 +0000 Subject: [PATCH 108/127] [AArch64] Add a phase-ordering test for dividing vscale. NFC See #126411 / #127055, the test isn't expected to fold in a single instcombine iteration, needing instcombine->cse->instcombine. 
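For reference, the expected end state: cntb here folds to vscale * 16 and cntw to vscale * 4 (the updated InstCombine test below checks exactly those shl-by-4 and shl-by-2 forms), so the quotient is the constant 4, and the new phase-ordering test checks that the full default pipeline reduces the function to ret i64 4. A single InstCombine run only reaches the cttz/lshr form because the two vscale calls are still distinct at that point; presumably the intervening CSE unifies them, letting the second InstCombine run finish the fold.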
--- .../sve-intrinsic-opts-counting-elems.ll | 17 +++++++++++++++++ .../Transforms/PhaseOrdering/AArch64/vscale.ll | 15 +++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/vscale.ll diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll index 4e7e9eeb7250b..46ca99f4bb27b 100644 --- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll @@ -240,6 +240,23 @@ define i64 @cntd_all() { } +define i64 @udiv() vscale_range(1, 16) { +; CHECK-LABEL: @udiv( +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[A:%.*]] = shl nuw nsw i64 [[TMP1]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[B:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = call range(i64 2, 65) i64 @llvm.cttz.i64(i64 [[B]], i1 true) +; CHECK-NEXT: [[C1:%.*]] = lshr i64 [[A]], [[TMP3]] +; CHECK-NEXT: ret i64 [[C1]] +; + %a = call i64 @llvm.aarch64.sve.cntb(i32 31) + %b = call i64 @llvm.aarch64.sve.cntw(i32 31) + %c = udiv i64 %a, %b + ret i64 %c +} + + declare i64 @llvm.aarch64.sve.cntb(i32 %pattern) declare i64 @llvm.aarch64.sve.cnth(i32 %pattern) declare i64 @llvm.aarch64.sve.cntw(i32 %pattern) diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/vscale.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/vscale.ll new file mode 100644 index 0000000000000..7aa50ddf61468 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/vscale.ll @@ -0,0 +1,15 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes="default" -mattr=+sve -S -o - %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +define i64 @udiv() vscale_range(1, 16) { +; CHECK-LABEL: @udiv( +; CHECK-NEXT: ret i64 4 +; + %a = call i64 @llvm.aarch64.sve.cntb(i32 31) + %b = call i64 @llvm.aarch64.sve.cntw(i32 31) + %c = udiv i64 %a, %b + ret i64 %c +} From 41be5bbbdba2939a5fdb82c968c102f993edc4d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Tue, 18 Feb 2025 10:50:21 +0000 Subject: [PATCH 109/127] [mlir][vector] Update tests for xfer permutation lowering (3/N) (#127320) * Remove `vector.create_mask` from tests. Instead, pass masks as arguments. This simplifies the tests without sacrificing test coverage. * Update `@xfer_read_minor_identity_tranposed_with_mask_scalable` to use similar shapes as other tests and to avoid using test Ops (e.g. `@test.some_use`). This improves consistency between tests. * Fix some comment typos. --- .../vector-transfer-permutation-lowering.mlir | 76 +++++++++---------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir b/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir index 6705905633e0f..dfc79a19e6cc6 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir @@ -1,5 +1,8 @@ // RUN: mlir-opt %s --transform-interpreter --split-input-file | FileCheck %s +// TODO: Review the usage of `in_bounds` and remove where not affecting the +// generated output. 
+ /// CHECK: #[[$MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d1, 0, d3)> ///---------------------------------------------------------------------------------------- @@ -106,8 +109,8 @@ func.func @xfer_write_minor_identity_transposed_map_masked( /// (neither a minor identity nor transposed minor identity map) /// OUT 1: vector.broadcast + vector.transfer_write /// (transposed minor identity) -/// OUT 2: vector.transfer_write -> vector.broadcast + vector.transpose + vector.transfer_write -/// (minor identity) +/// OUT 2: vector.transfer_write -> vector.broadcast + vector.transpose +/// + vector.transfer_write (minor identity) ///---------------------------------------------------------------------------------------- // CHECK-LABEL: func.func @xfer_write_non_minor_identity( @@ -233,16 +236,16 @@ func.func @xfer_write_non_minor_identity_masked_scalable( // CHECK-LABEL: func @xfer_write_non_minor_identity_masked_2 // CHECK-SAME: %[[DEST:.*]]: tensor // CHECK-SAME: %[[VEC:.*]]: vector<14x8x16xf32> -// CHECK-SAME: %[[DIM:.*]]: index, %[[IDX:.*]]: index) -> tensor +// CHECK-SAME: %[[MASK:.*]]: vector<14x8x16xi1> +// CHECK-SAME: %[[DIM:.*]]: index // CHECK-NOT: vector.broadcast -// CHECK: vector.mask %0 { vector.transfer_write %[[VEC]], %[[DEST]]{{.*}} : vector<14x8x16xf32>, tensor } : vector<14x8x16xi1> -> tensor +// CHECK: vector.mask %[[MASK]] { vector.transfer_write %[[VEC]], %[[DEST]]{{.*}} : vector<14x8x16xf32>, tensor } : vector<14x8x16xi1> -> tensor func.func @xfer_write_non_minor_identity_masked_2( %dest : tensor, %vec : vector<14x8x16xf32>, - %dim : index, + %mask: vector<14x8x16xi1>, %idx: index) -> tensor { - %mask = vector.create_mask %dim, %dim, %dim : vector<14x8x16xi1> %res = vector.mask %mask { vector.transfer_write %vec, %dest[%idx, %idx, %idx, %idx] { in_bounds = [false, false, true], @@ -259,29 +262,27 @@ func.func @xfer_write_non_minor_identity_masked_2( /// /// IN: vector.transfer_read /// (_transposed_ minor identity permutation map, with 0 or more broadcast dims) -/// OUT: vector.transpose + vector.transfer_write +/// OUT: vector.transfer_read + vector.broadcast + vector.transpose /// (minor identity permutation map with 0 or more leading broadcast dims) ///---------------------------------------------------------------------------------------- /// TODO: Inner broadcast dim - see also the block at the bottom of this file -// CHECK-LABEL: func.func @xfer_read_minor_identity_tranposed_with_mask +// CHECK-LABEL: func.func @xfer_read_minor_identity_transposed_with_mask // CHECK-SAME: %[[MEM:.*]]: memref, -// CHECK-SAME: %[[DIM_1:.*]]: index, %[[DIM_2:.*]]: index, %[[IDX:.*]]: index) -> vector<8x4x2xf32> { +// CHECK-SAME: %[[MASK:.*]]: vector<2x4xi1> +// CHECK-SAME: %[[IDX:.*]]: index // CHECK: %[[PASS_THROUGH:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[MASK:.*]] = vector.create_mask %[[DIM_2]], %[[DIM_1]] : vector<2x4xi1> // CHECK: %[[T_READ:.*]] = vector.transfer_read %[[MEM]]{{\[}}%[[IDX]], %[[IDX]]], %[[PASS_THROUGH]], %[[MASK]] {in_bounds = [true, true]} : memref, vector<2x4xf32> // CHECK: %[[BCAST:.*]] = vector.broadcast %[[T_READ]] : vector<2x4xf32> to vector<8x2x4xf32> // CHECK: %[[TRANSPOSE:.*]] = vector.transpose %[[BCAST]], [0, 2, 1] : vector<8x2x4xf32> to vector<8x4x2xf32> // CHECK: return %[[TRANSPOSE]] : vector<8x4x2xf32> -func.func @xfer_read_minor_identity_tranposed_with_mask( +func.func @xfer_read_minor_identity_transposed_with_mask( %mem: memref, - %dim_1: index, - %dim_2: index, + %mask: vector<2x4xi1>, %idx: index) -> (vector<8x4x2xf32>) { %pad = 
arith.constant 0.000000e+00 : f32 - %mask = vector.create_mask %dim_2, %dim_1 : vector<2x4xi1> %res = vector.transfer_read %mem[%idx, %idx], %pad, %mask { in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1) -> (0, d1, d0)> @@ -290,24 +291,22 @@ func.func @xfer_read_minor_identity_tranposed_with_mask( return %res : vector<8x4x2xf32> } -// CHECK-LABEL: func.func @xfer_read_minor_identity_tranposed_with_mask_scalable( +// CHECK-LABEL: func.func @xfer_read_minor_identity_transposed_with_mask_scalable( // CHECK-SAME: %[[MEM:.*]]: memref, -// CHECK-SAME: %[[DIM_1:.*]]: index, %[[DIM_2:.*]]: index, %[[IDX:.*]]: index) -> vector<8x[4]x2xf32> { +// CHECK-SAME: %[[MASK:.*]]: vector<2x[4]xi1> +// CHECK-SAME: %[[IDX:.*]]: index // CHECK: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[MASK:.*]] = vector.create_mask %[[DIM_2]], %[[DIM_1]] : vector<2x[4]xi1> // CHECK: %[[T_READ:.*]] = vector.transfer_read %[[MEM]]{{\[}}%[[IDX]], %[[IDX]]], %[[PAD]], %[[MASK]] {in_bounds = [true, true]} : memref, vector<2x[4]xf32> // CHECK: %[[BCAST:.*]] = vector.broadcast %[[T_READ]] : vector<2x[4]xf32> to vector<8x2x[4]xf32> // CHECK: %[[TRANSPOSE:.*]] = vector.transpose %[[BCAST]], [0, 2, 1] : vector<8x2x[4]xf32> to vector<8x[4]x2xf32> // CHECK: return %[[TRANSPOSE]] : vector<8x[4]x2xf32> -func.func @xfer_read_minor_identity_tranposed_with_mask_scalable( +func.func @xfer_read_minor_identity_transposed_with_mask_scalable( %mem: memref, - %dim_1: index, - %dim_2: index, + %mask: vector<2x[4]xi1>, %idx: index) -> (vector<8x[4]x2xf32>) { %pad = arith.constant 0.000000e+00 : f32 - %mask = vector.create_mask %dim_2, %dim_1 : vector<2x[4]xi1> %res = vector.transfer_read %mem[%idx, %idx], %pad, %mask { in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1) -> (0, d1, d0)> @@ -319,24 +318,26 @@ func.func @xfer_read_minor_identity_tranposed_with_mask_scalable( // Masked version is not supported // CHECK-LABEL: func @xfer_read_minor_identity_transposed_masked( -// CHECK-SAME: %[[DEST:.*]]: tensor, -// CHECK-SAME: %[[MASK:.*]]: vector<4x1xi1> +// CHECK-SAME: %[[DEST:.*]]: tensor, +// CHECK-SAME: %[[MASK:.*]]: vector<2x4xi1> +// CHECK-SAME: %[[IDX:.*]]: index // CHECK-NOT: vector.transpose -// CHECK: vector.mask %[[MASK]] { vector.transfer_read %[[DEST]]{{.*}}: tensor, vector<1x4x4xf32> } : vector<4x1xi1> -> vector<1x4x4xf32> +// CHECK: vector.mask %[[MASK]] { vector.transfer_read %[[DEST]]{{.*}}: tensor, vector<8x4x2xf32> } : vector<2x4xi1> -> vector<8x4x2xf32> func.func @xfer_read_minor_identity_transposed_masked( - %dest: tensor, - %mask : vector<4x1xi1>, - %idx: index) { + %dest: tensor, + %mask: vector<2x4xi1>, + %idx: index) -> (vector<8x4x2xf32>) { %pad = arith.constant 0.000000e+00 : f32 - %3 = vector.mask %mask { + + %res = vector.mask %mask { vector.transfer_read %dest[%idx, %idx], %pad { - permutation_map = affine_map<(d0, d1) -> (d1, 0, d0)> - } : tensor, vector<1x4x4xf32> - } : vector<4x1xi1> -> vector<1x4x4xf32> + in_bounds = [true, true, true], + permutation_map = affine_map<(d0, d1) -> (0, d1, d0)> + } : tensor, vector<8x4x2xf32> + } : vector<2x4xi1> -> vector<8x4x2xf32> - "test.some_use"(%3) : (vector<1x4x4xf32>) -> () - return + return %res : vector<8x4x2xf32> } // CHECK-LABEL: func.func @xfer_read_minor_identity_transposed_masked_scalable( @@ -346,7 +347,7 @@ func.func @xfer_read_minor_identity_transposed_masked( // CHECK: %[[T_READ:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[DEST]]{{.*}} : tensor, vector<8x[4]x2xf32> } : vector<2x[4]xi1> -> 
vector<8x[4]x2xf32> func.func @xfer_read_minor_identity_transposed_masked_scalable( %dest: tensor, - %mask : vector<2x[4]xi1>, + %mask: vector<2x[4]xi1>, %idx: index) -> vector<8x[4]x2xf32> { %pad = arith.constant 0.000000e+00 : f32 @@ -388,17 +389,16 @@ func.func @xfer_read_minor_identitiy_bcast_dims_scalable( // CHECK-LABEL: func.func @xfer_read_minor_identitiy_bcast_dims_masked // CHECK-SAME: %[[MEM:.*]]: memref, -// CHECK-SAME: %[[DIM:.*]]: index, +// CHECK-SAME: %[[MASK:.*]]: vector<[4]x3xi1> // CHECK-SAME: %[[IDX:.*]]: index) -> vector<8x[4]x2x3xf32> { // CHECK-NOT: vector.broadcast -// CHECK: %[[MASK:.*]] = vector.mask %0 { vector.transfer_read %[[MEM]]{{.*}} : memref, vector<8x[4]x2x3xf32> } : vector<[4]x3xi1> -> vector<8x[4]x2x3xf32> +// CHECK: vector.mask %[[MASK]] { vector.transfer_read %[[MEM]]{{.*}} : memref, vector<8x[4]x2x3xf32> } : vector<[4]x3xi1> -> vector<8x[4]x2x3xf32> func.func @xfer_read_minor_identitiy_bcast_dims_masked( %mem: memref, - %dim: index, + %mask: vector<[4]x3xi1>, %idx: index) -> vector<8x[4]x2x3xf32> { %pad = arith.constant 0.000000e+00 : f32 - %mask = vector.create_mask %dim, %dim: vector<[4]x3xi1> %res = vector.mask %mask { vector.transfer_read %mem[%idx, %idx, %idx, %idx], %pad { From 86553788780ec3992eb0ab18815cc12f2782be1a Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy Date: Tue, 18 Feb 2025 12:03:37 +0100 Subject: [PATCH 110/127] [SPIR-V] Built-in variables: mapping from an OpenCL to SPIR-V BuiltIn and SPIR-V friendly builtins for Image Read/Write instructions (#127242) This PR improves built-in variables and functions support: * extends mapping from an OpenCL C built-in function to the SPIR-V BuiltIn variables as in https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_Env.html#_built_in_variables, and * adds SPIR-V friendly builtins for Image Read/Write instructions. Test cases are extended accordingly. --- llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 16 +++- llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 12 +++ .../CodeGen/SPIRV/builtin_vars-decorate.ll | 85 ++++++++++++++----- .../CodeGen/SPIRV/transcoding/builtin_vars.ll | 9 +- .../CodeGen/SPIRV/transcoding/spirv-types.ll | 58 ++++++++++++- 5 files changed, 153 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index aec2aaccb33a8..7b897f7e34c6f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -216,6 +216,7 @@ std::string lookupBuiltinNameHelper(StringRef DemangledCall, // Check if the extracted name begins with: // - "__spirv_ImageSampleExplicitLod" // - "__spirv_ImageRead" + // - "__spirv_ImageWrite" // - "__spirv_ImageQuerySizeLod" // - "__spirv_UDotKHR" // - "__spirv_SDotKHR" @@ -233,20 +234,21 @@ std::string lookupBuiltinNameHelper(StringRef DemangledCall, // - "__spirv_SConvert" // - "__spirv_FConvert" // - "__spirv_SatConvert" - // and contains return type information at the end "_R". + // and maybe contains return type information at the end "_R". // If so, extract the plain builtin name without the type information. 
static const std::regex SpvWithR( - "(__spirv_(ImageSampleExplicitLod|ImageRead|ImageQuerySizeLod|UDotKHR|" + "(__spirv_(ImageSampleExplicitLod|ImageRead|ImageWrite|ImageQuerySizeLod|" + "UDotKHR|" "SDotKHR|SUDotKHR|SDotAccSatKHR|UDotAccSatKHR|SUDotAccSatKHR|" "ReadClockKHR|SubgroupBlockReadINTEL|SubgroupImageBlockReadINTEL|" "SubgroupImageMediaBlockReadINTEL|SubgroupImageMediaBlockWriteINTEL|" "Convert|" - "UConvert|SConvert|FConvert|SatConvert).*)_R[^_]*_?(\\w+)?.*"); + "UConvert|SConvert|FConvert|SatConvert)[^_]*)(_R[^_]*_?(\\w+)?.*)?"); std::smatch Match; if (std::regex_match(BuiltinName, Match, SpvWithR) && Match.size() > 1) { std::ssub_match SubMatch; if (DecorationId && Match.size() > 3) { - SubMatch = Match[3]; + SubMatch = Match[4]; *DecorationId = demangledPostfixToDecorationId(SubMatch.str()); } SubMatch = Match[1]; @@ -1932,6 +1934,9 @@ static bool generateReadImageInst(const StringRef DemangledCall, const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { + if (Call->isSpirvOp()) + return buildOpFromWrapper(MIRBuilder, SPIRV::OpImageRead, Call, + GR->getSPIRVTypeID(Call->ReturnType)); Register Image = Call->Arguments[0]; MachineRegisterInfo *MRI = MIRBuilder.getMRI(); bool HasOclSampler = DemangledCall.contains_insensitive("ocl_sampler"); @@ -2011,6 +2016,9 @@ static bool generateReadImageInst(const StringRef DemangledCall, static bool generateWriteImageInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { + if (Call->isSpirvOp()) + return buildOpFromWrapper(MIRBuilder, SPIRV::OpImageWrite, Call, + Register(0)); MIRBuilder.buildInstr(SPIRV::OpImageWrite) .addUse(Call->Arguments[0]) // Image. .addUse(Call->Arguments[1]) // Coordinate. diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index 16f4252173e33..85f42fc08a4e0 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -110,11 +110,13 @@ def : DemangledBuiltin<"__spirv_DotAccSat", OpenCL_std, IntegerDot, 3, 3>; def : DemangledBuiltin<"read_imagei", OpenCL_std, ReadImage, 2, 4>; def : DemangledBuiltin<"read_imageui", OpenCL_std, ReadImage, 2, 4>; def : DemangledBuiltin<"read_imagef", OpenCL_std, ReadImage, 2, 4>; +def : DemangledBuiltin<"__spirv_ImageRead", OpenCL_std, ReadImage, 2, 0>; def : DemangledBuiltin<"write_imagef", OpenCL_std, WriteImage, 3, 4>; def : DemangledBuiltin<"write_imagei", OpenCL_std, WriteImage, 3, 4>; def : DemangledBuiltin<"write_imageui", OpenCL_std, WriteImage, 3, 4>; def : DemangledBuiltin<"write_imageh", OpenCL_std, WriteImage, 3, 4>; +def : DemangledBuiltin<"__spirv_ImageWrite", OpenCL_std, WriteImage, 3, 0>; def : DemangledBuiltin<"__translate_sampler_initializer", OpenCL_std, SampleImage, 1, 1>; def : DemangledBuiltin<"__spirv_SampledImage", OpenCL_std, SampleImage, 2, 2>; @@ -1323,6 +1325,15 @@ multiclass DemangledGetBuiltin; +defm : DemangledGetBuiltin<"get_local_linear_id", OpenCL_std, Variable, LocalInvocationIndex>; +defm : DemangledGetBuiltin<"get_work_dim", OpenCL_std, Variable, WorkDim>; +defm : DemangledGetBuiltin<"get_sub_group_size", OpenCL_std, Variable, SubgroupSize>; +defm : DemangledGetBuiltin<"get_max_sub_group_size", OpenCL_std, Variable, SubgroupMaxSize>; +defm : DemangledGetBuiltin<"get_num_sub_groups", OpenCL_std, Variable, NumSubgroups>; +defm : DemangledGetBuiltin<"get_enqueued_num_sub_groups", OpenCL_std, Variable, NumEnqueuedSubgroups>; +defm : DemangledGetBuiltin<"get_sub_group_id", OpenCL_std, Variable, 
SubgroupId>; +defm : DemangledGetBuiltin<"get_sub_group_local_id", OpenCL_std, Variable, SubgroupLocalInvocationId>; defm : DemangledGetBuiltin<"get_sub_group_eq_mask", OpenCL_std, Variable, SubgroupEqMask>; defm : DemangledGetBuiltin<"get_sub_group_ge_mask", OpenCL_std, Variable, SubgroupGeMask>; defm : DemangledGetBuiltin<"get_sub_group_gt_mask", OpenCL_std, Variable, SubgroupGtMask>; @@ -1339,6 +1350,7 @@ defm : DemangledGetBuiltin<"get_global_size", OpenCL_std, GetQuery, GlobalSize>; defm : DemangledGetBuiltin<"get_group_id", OpenCL_std, GetQuery, WorkgroupId>; defm : DemangledGetBuiltin<"get_enqueued_local_size", OpenCL_std, GetQuery, EnqueuedWorkgroupSize>; defm : DemangledGetBuiltin<"get_num_groups", OpenCL_std, GetQuery, NumWorkgroups>; +defm : DemangledGetBuiltin<"get_global_offset", OpenCL_std, GetQuery, GlobalOffset>; defm : DemangledGetBuiltin<"__hlsl_wave_get_lane_index", GLSL_std_450, Wave, SubgroupLocalInvocationId>; //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/SPIRV/builtin_vars-decorate.ll b/llvm/test/CodeGen/SPIRV/builtin_vars-decorate.ll index 59abd5dbee6a0..0c9b29de890d4 100644 --- a/llvm/test/CodeGen/SPIRV/builtin_vars-decorate.ll +++ b/llvm/test/CodeGen/SPIRV/builtin_vars-decorate.ll @@ -1,22 +1,23 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s - -; CHECK: OpName %[[#WD:]] "__spirv_BuiltInWorkDim" -; CHECK: OpName %[[#GS:]] "__spirv_BuiltInGlobalSize" -; CHECK: OpName %[[#GII:]] "__spirv_BuiltInGlobalInvocationId" -; CHECK: OpName %[[#WS:]] "__spirv_BuiltInWorkgroupSize" -; CHECK: OpName %[[#EWS:]] "__spirv_BuiltInEnqueuedWorkgroupSize" -; CHECK: OpName %[[#LLI:]] "__spirv_BuiltInLocalInvocationId" -; CHECK: OpName %[[#NW:]] "__spirv_BuiltInNumWorkgroups" -; CHECK: OpName %[[#WI:]] "__spirv_BuiltInWorkgroupId" -; CHECK: OpName %[[#GO:]] "__spirv_BuiltInGlobalOffset" -; CHECK: OpName %[[#GLI:]] "__spirv_BuiltInGlobalLinearId" -; CHECK: OpName %[[#LLII:]] "__spirv_BuiltInLocalInvocationIndex" -; CHECK: OpName %[[#SS:]] "__spirv_BuiltInSubgroupSize" -; CHECK: OpName %[[#SMS:]] "__spirv_BuiltInSubgroupMaxSize" -; CHECK: OpName %[[#NS:]] "__spirv_BuiltInNumSubgroups" -; CHECK: OpName %[[#NES:]] "__spirv_BuiltInNumEnqueuedSubgroups" -; CHECK: OpName %[[#SI:]] "__spirv_BuiltInSubgroupId" -; CHECK: OpName %[[#SLII:]] "__spirv_BuiltInSubgroupLocalInvocationId" +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: OpName %[[#WD:]] "__spirv_BuiltInWorkDim" +; CHECK-DAG: OpName %[[#GS:]] "__spirv_BuiltInGlobalSize" +; CHECK-DAG: OpName %[[#GII:]] "__spirv_BuiltInGlobalInvocationId" +; CHECK-DAG: OpName %[[#WS:]] "__spirv_BuiltInWorkgroupSize" +; CHECK-DAG: OpName %[[#EWS:]] "__spirv_BuiltInEnqueuedWorkgroupSize" +; CHECK-DAG: OpName %[[#LLI:]] "__spirv_BuiltInLocalInvocationId" +; CHECK-DAG: OpName %[[#NW:]] "__spirv_BuiltInNumWorkgroups" +; CHECK-DAG: OpName %[[#WI:]] "__spirv_BuiltInWorkgroupId" +; CHECK-DAG: OpName %[[#GO:]] "__spirv_BuiltInGlobalOffset" +; CHECK-DAG: OpName %[[#GLI:]] "__spirv_BuiltInGlobalLinearId" +; CHECK-DAG: OpName %[[#LLII:]] "__spirv_BuiltInLocalInvocationIndex" +; CHECK-DAG: OpName %[[#SS:]] "__spirv_BuiltInSubgroupSize" +; CHECK-DAG: OpName %[[#SMS:]] "__spirv_BuiltInSubgroupMaxSize" +; CHECK-DAG: OpName %[[#NS:]] "__spirv_BuiltInNumSubgroups" +; CHECK-DAG: OpName %[[#NES:]] 
"__spirv_BuiltInNumEnqueuedSubgroups" +; CHECK-DAG: OpName %[[#SI:]] "__spirv_BuiltInSubgroupId" +; CHECK-DAG: OpName %[[#SLII:]] "__spirv_BuiltInSubgroupLocalInvocationId" ; CHECK-DAG: OpDecorate %[[#NW]] BuiltIn NumWorkgroups ; CHECK-DAG: OpDecorate %[[#WS]] BuiltIn WorkgroupSize @@ -35,6 +36,33 @@ ; CHECK-DAG: OpDecorate %[[#NES]] BuiltIn NumEnqueuedSubgroups ; CHECK-DAG: OpDecorate %[[#SI]] BuiltIn SubgroupId ; CHECK-DAG: OpDecorate %[[#SLII]] BuiltIn SubgroupLocalInvocationId + +; CHECK-DAG: %[[#SizeT:]] = OpTypeInt 64 0 +; CHECK-DAG: %[[#Int32:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#SizeTPtr:]] = OpTypePointer Input %[[#SizeT]] +; CHECK-DAG: %[[#Int32Ptr:]] = OpTypePointer Input %[[#Int32]] + +; CHECK-DAG: %[[#GLI]] = OpVariable %[[#SizeTPtr]] Input +; CHECK-DAG: %[[#LLII]] = OpVariable %[[#SizeTPtr]] Input +; CHECK-DAG: %[[#WD]] = OpVariable %[[#Int32Ptr]] Input +; CHECK-DAG: %[[#SS]] = OpVariable %[[#Int32Ptr]] Input +; CHECK-DAG: %[[#SMS]] = OpVariable %[[#Int32Ptr]] Input +; CHECK-DAG: %[[#NS]] = OpVariable %[[#Int32Ptr]] Input +; CHECK-DAG: %[[#NES]] = OpVariable %[[#Int32Ptr]] Input +; CHECK-DAG: %[[#SI]] = OpVariable %[[#Int32Ptr]] Input +; CHECK-DAG: %[[#SLII]] = OpVariable %[[#Int32Ptr]] Input + +; CHECK: OpFunction +; CHECK: %[[#]] = OpLoad %[[#SizeT]] %[[#GLI]] +; CHECK: %[[#]] = OpLoad %[[#SizeT]] %[[#LLII]] +; CHECK: %[[#]] = OpLoad %[[#Int32]] %[[#WD]] +; CHECK: %[[#]] = OpLoad %[[#Int32]] %[[#SS]] +; CHECK: %[[#]] = OpLoad %[[#Int32]] %[[#SMS]] +; CHECK: %[[#]] = OpLoad %[[#Int32]] %[[#NS]] +; CHECK: %[[#]] = OpLoad %[[#Int32]] %[[#NES]] +; CHECK: %[[#]] = OpLoad %[[#Int32]] %[[#SI]] +; CHECK: %[[#]] = OpLoad %[[#Int32]] %[[#SLII]] + @__spirv_BuiltInWorkDim = external addrspace(1) global i32 @__spirv_BuiltInGlobalSize = external addrspace(1) global <3 x i32> @__spirv_BuiltInGlobalInvocationId = external addrspace(1) global <3 x i32> @@ -55,5 +83,24 @@ define spir_kernel void @_Z1wv() { entry: + %r1 = tail call spir_func i64 @get_global_linear_id() + %r2 = tail call spir_func i64 @get_local_linear_id() + %r3 = tail call spir_func i32 @get_work_dim() + %r4 = tail call spir_func i32 @get_sub_group_size() + %r5 = tail call spir_func i32 @get_max_sub_group_size() + %r6 = tail call spir_func i32 @get_num_sub_groups() + %r7 = tail call spir_func i32 @get_enqueued_num_sub_groups() + %r8 = tail call spir_func i32 @get_sub_group_id() + %r9 = tail call spir_func i32 @get_sub_group_local_id() ret void } + +declare spir_func i64 @get_global_linear_id() +declare spir_func i64 @get_local_linear_id() +declare spir_func i32 @get_work_dim() +declare spir_func i32 @get_sub_group_size() +declare spir_func i32 @get_max_sub_group_size() +declare spir_func i32 @get_num_sub_groups() +declare spir_func i32 @get_enqueued_num_sub_groups() +declare spir_func i32 @get_sub_group_id() +declare spir_func i32 @get_sub_group_local_id() diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll index 5074893163565..44d2f5e24f59d 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll @@ -1,8 +1,11 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} -; CHECK-SPIRV: OpDecorate %[[#Id:]] BuiltIn GlobalLinearId -; CHECK-SPIRV: %[[#Id:]] = 
OpVariable %[[#]] +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpDecorate %[[#Id:]] BuiltIn GlobalLinearId +; CHECK: %[[#Id:]] = OpVariable %[[#]] @__spirv_BuiltInGlobalLinearId = external addrspace(1) global i32 diff --git a/llvm/test/CodeGen/SPIRV/transcoding/spirv-types.ll b/llvm/test/CodeGen/SPIRV/transcoding/spirv-types.ll index 8d99a0c6cd1ce..36ae6bf478127 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/spirv-types.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/spirv-types.ll @@ -85,5 +85,61 @@ define spir_func void @test_sampler(target("spirv.Image", float, 1, 1, 0, 0, 0, } declare spir_func target("spirv.Image", float, 1, 1, 0, 0, 0, 0, 0) @_Z20__spirv_SampledImagePU3AS1K34__spirv_Image__float_1_1_0_0_0_0_0PU3AS1K15__spirv_Sampler(target("spirv.Image", float, 1, 1, 0, 0, 0, 0, 0), target("spirv.Sampler")) - declare spir_func <4 x float> @_Z38__spirv_ImageSampleExplicitLod_Rfloat4PU3AS120__spirv_SampledImageDv4_iif(target("spirv.Image", float, 1, 1, 0, 0, 0, 0, 0), <4 x i32>, i32, float) + +; CHECK-SPIRV: %[[#]] = OpImageRead +; CHECK-SPIRV: %[[#]] = OpImageRead +; CHECK-SPIRV: %[[#]] = OpImageRead +; CHECK-SPIRV: %[[#]] = OpImageRead +; CHECK-SPIRV: %[[#]] = OpImageRead +; CHECK-SPIRV: %[[#]] = OpImageRead +; CHECK-SPIRV: %[[#]] = OpImageRead +; CHECK-SPIRV: %[[#]] = OpImageSampleExplicitLod + +define dso_local spir_kernel void @reads() { + %1 = tail call spir_func i32 @_Z17__spirv_ImageReadIi14ocl_image3d_roDv4_iET_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) poison, <4 x i32> zeroinitializer) + %2 = tail call spir_func <2 x i32> @_Z17__spirv_ImageReadIDv2_i14ocl_image2d_roS0_ET_T0_T1_(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) poison, <2 x i32> zeroinitializer) + %3 = tail call spir_func <4 x i32> @_Z17__spirv_ImageReadIDv4_j14ocl_image3d_roDv4_iET_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) poison, <4 x i32> zeroinitializer) + %4 = tail call spir_func signext i16 @_Z17__spirv_ImageReadIs14ocl_image1d_roiET_T0_T1_(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0) poison, i32 0) + %5 = tail call spir_func zeroext i16 @_Z17__spirv_ImageReadIt14ocl_image3d_roDv4_iET_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) poison, <4 x i32> zeroinitializer) + %6 = tail call spir_func <2 x float> @_Z17__spirv_ImageReadIDv2_f14ocl_image1d_roiET_T0_T1_(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0) poison, i32 0) + %7 = tail call spir_func half @_Z17__spirv_ImageReadIDF16_14ocl_image2d_roDv2_iET_T0_T1_(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) poison, <2 x i32> zeroinitializer) + %8 = tail call spir_func <4 x i32> @_Z30__spirv_ImageSampleExplicitLodI32__spirv_SampledImage__image1d_roDv4_jfET0_T_T1_if(target("spirv.SampledImage", void, 0, 0, 0, 0, 0, 0, 0) poison, float 0.000000e+00, i32 2, float 0.000000e+00) + ret void +} + +declare dso_local spir_func i32 @_Z17__spirv_ImageReadIi14ocl_image3d_roDv4_iET_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0), <4 x i32>) +declare dso_local spir_func <2 x i32> @_Z17__spirv_ImageReadIDv2_i14ocl_image2d_roS0_ET_T0_T1_(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), <2 x i32>) +declare dso_local spir_func <4 x i32> @_Z17__spirv_ImageReadIDv4_j14ocl_image3d_roDv4_iET_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0), <4 x i32>) +declare dso_local spir_func signext i16 
@_Z17__spirv_ImageReadIs14ocl_image1d_roiET_T0_T1_(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0), i32) +declare dso_local spir_func zeroext i16 @_Z17__spirv_ImageReadIt14ocl_image3d_roDv4_iET_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0), <4 x i32>) +declare dso_local spir_func <2 x float> @_Z17__spirv_ImageReadIDv2_f14ocl_image1d_roiET_T0_T1_(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0), i32) +declare dso_local spir_func half @_Z17__spirv_ImageReadIDF16_14ocl_image2d_roDv2_iET_T0_T1_(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), <2 x i32>) +declare dso_local spir_func <4 x i32> @_Z30__spirv_ImageSampleExplicitLodI32__spirv_SampledImage__image1d_roDv4_jfET0_T_T1_if(target("spirv.SampledImage", void, 0, 0, 0, 0, 0, 0, 0), float noundef, i32 noundef, float noundef) + +; CHECK-SPIRV: OpImageWrite +; CHECK-SPIRV: OpImageWrite +; CHECK-SPIRV: OpImageWrite +; CHECK-SPIRV: OpImageWrite +; CHECK-SPIRV: OpImageWrite +; CHECK-SPIRV: OpImageWrite +; CHECK-SPIRV: OpImageWrite + +define dso_local spir_kernel void @writes() { + call spir_func void @_Z18__spirv_ImageWriteI14ocl_image3d_woDv4_iiEvT_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1) poison, <4 x i32> zeroinitializer, i32 zeroinitializer) + call spir_func void @_Z18__spirv_ImageWriteI14ocl_image2d_woDv2_iS1_EvT_T0_T1_(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) poison, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer) + call spir_func void @_Z18__spirv_ImageWriteI14ocl_image3d_woDv4_iDv4_jEvT_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1) poison, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer) + call spir_func void @_Z18__spirv_ImageWriteI14ocl_image1d_woisEvT_T0_T1_(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1) poison, i32 0, i16 signext 0) + call spir_func void @_Z18__spirv_ImageWriteI14ocl_image3d_woDv4_itEvT_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1) poison, <4 x i32> zeroinitializer, i16 zeroext 0) + call spir_func void @_Z18__spirv_ImageWriteI14ocl_image1d_woiDv2_fEvT_T0_T1_(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1) poison, i32 0, <2 x float> zeroinitializer) + call spir_func void @_Z18__spirv_ImageWriteI14ocl_image2d_woDv2_iDF16_EvT_T0_T1_(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) poison, <2 x i32> zeroinitializer, half zeroinitializer) + ret void +} + +declare dso_local spir_func void @_Z18__spirv_ImageWriteI14ocl_image3d_woDv4_iiEvT_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1), <4 x i32>, i32) +declare dso_local spir_func void @_Z18__spirv_ImageWriteI14ocl_image2d_woDv2_iS1_EvT_T0_T1_(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, <2 x i32>) +declare dso_local spir_func void @_Z18__spirv_ImageWriteI14ocl_image3d_woDv4_iDv4_jEvT_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1), <4 x i32>, <4 x i32>) +declare dso_local spir_func void @_Z18__spirv_ImageWriteI14ocl_image1d_woisEvT_T0_T1_(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1), i32, i16 signext) +declare dso_local spir_func void @_Z18__spirv_ImageWriteI14ocl_image3d_woDv4_itEvT_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1), <4 x i32>, i16 zeroext) +declare dso_local spir_func void @_Z18__spirv_ImageWriteI14ocl_image1d_woiDv2_fEvT_T0_T1_(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1), i32, <2 x float>) +declare dso_local spir_func void @_Z18__spirv_ImageWriteI14ocl_image2d_woDv2_iDF16_EvT_T0_T1_(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, half) From 762001118c068317ec67274221497be2e6499c6a Mon Sep 17 00:00:00 2001 From: Steven Cooreman 
Date: Tue, 18 Feb 2025 12:12:23 +0100 Subject: [PATCH 111/127] [libc++] Do not guard inclusion of wchar.h with _LIBCPP_HAS_WIDE_CHARACTERS (#126924) `mbstate_t` needs to be visible to libcpp, even when it is not providing wide character functionality (i.e. `_LIBCPP_HAS_WIDE_CHARACTERS` is turned off) and thus not using any of the C library's wide character functions. There are C libraries (such as newlib-nano/nanolib/picolibc) which do provide their definition of `mbstate_t` in `<wchar.h>` even though they do not come with wide character functions. Since there is a way to conditionally include the C library's `<wchar.h>` only if it exists, we should rely on the fact that if it exists, it will provide `mbstate_t`. Removing this guard will allow using libc++ on top of newlib-nano/picolibc while not breaking the cases where it is used on top of a C library which doesn't provide `<wchar.h>` (since it would then still go look for `<uchar.h>` or error out). --- libcxx/include/__mbstate_t.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libcxx/include/__mbstate_t.h b/libcxx/include/__mbstate_t.h index e013384454b41..c23ea7113ca70 100644 --- a/libcxx/include/__mbstate_t.h +++ b/libcxx/include/__mbstate_t.h @@ -43,12 +43,12 @@ # include // works on most Unixes #elif __has_include() # include // works on Darwin -#elif _LIBCPP_HAS_WIDE_CHARACTERS && __has_include_next() -# include_next // fall back to the C standard provider of mbstate_t +#elif __has_include_next() +# include_next // use the C standard provider of mbstate_t if present #elif __has_include_next() -# include_next // is also required to make mbstate_t visible +# include_next // Try in absence of for mbstate_t #else -# error "We don't know how to get the definition of mbstate_t without on your platform." +# error "We don't know how to get the definition of mbstate_t on your platform." #endif #endif // _LIBCPP___MBSTATE_T_H From 059f044309a282447fb25073875cff8d2bd96f78 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Tue, 18 Feb 2025 22:10:17 +1100 Subject: [PATCH 112/127] [ORC] Propagate weak & hidden flags when creating lazy reexports, redirectables. Updates JITLinkRedirectableSymbolManager to take alias flags into account when setting the scope and linkage of the created stubs (weak aliases now get weak linkage, hidden stubs get hidden visibility). Updates lazyReexports to propagate alias flags (rather than trampoline flags) when building the initial destinations map for the redirectable symbols manager. Together these changes allow the LazyObjectLinkingLayer to link objects containing weak and hidden symbols.
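For illustration only, here is a minimal, self-contained C++ sketch of the flag mapping described above. The enums and structs are simplified stand-ins rather than the real JITLink/ORC types (jitlink::Scope, jitlink::Linkage, JITSymbolFlags), so treat this as a sketch of the rule, not the actual implementation.

```cpp
#include <cstdio>

// Simplified stand-ins for the scope/linkage of an emitted stub and the
// flags carried by the alias it redirects to.
enum class Scope { Default, Hidden };
enum class Linkage { Strong, Weak };
struct AliasFlags {
  bool Exported; // true if the alias is externally visible
  bool Weak;     // true if the alias is a weak definition
};

struct StubAttrs {
  Scope StubScope;
  Linkage StubLinkage;
};

// The rule described above: hidden aliases produce hidden stubs, and weak
// aliases produce weakly linked stubs instead of unconditionally strong ones.
static StubAttrs stubAttrsFor(AliasFlags F) {
  return {F.Exported ? Scope::Default : Scope::Hidden,
          F.Weak ? Linkage::Weak : Linkage::Strong};
}

int main() {
  AliasFlags WeakExported{true, true};   // e.g. a weak, exported definition
  AliasFlags HiddenStrong{false, false}; // e.g. a hidden, non-weak definition
  StubAttrs A = stubAttrsFor(WeakExported);
  StubAttrs B = stubAttrsFor(HiddenStrong);
  std::printf("weak exported stub: scope=%s linkage=%s\n",
              A.StubScope == Scope::Default ? "default" : "hidden",
              A.StubLinkage == Linkage::Weak ? "weak" : "strong");
  std::printf("hidden strong stub: scope=%s linkage=%s\n",
              B.StubScope == Scope::Default ? "default" : "hidden",
              B.StubLinkage == Linkage::Weak ? "weak" : "strong");
  return 0;
}
```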
--- .../orc/TestCases/Generic/Inputs/bar-ret-void-weak.ll | 4 ++++ .../TestCases/Generic/Inputs/baz-ret-void-hidden.ll | 4 ++++ compiler-rt/test/orc/TestCases/Generic/lazy-link.ll | 10 +++++++++- .../Orc/JITLinkRedirectableSymbolManager.cpp | 5 ++++- llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp | 2 +- 5 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 compiler-rt/test/orc/TestCases/Generic/Inputs/bar-ret-void-weak.ll create mode 100644 compiler-rt/test/orc/TestCases/Generic/Inputs/baz-ret-void-hidden.ll diff --git a/compiler-rt/test/orc/TestCases/Generic/Inputs/bar-ret-void-weak.ll b/compiler-rt/test/orc/TestCases/Generic/Inputs/bar-ret-void-weak.ll new file mode 100644 index 0000000000000..7301b43e7c92d --- /dev/null +++ b/compiler-rt/test/orc/TestCases/Generic/Inputs/bar-ret-void-weak.ll @@ -0,0 +1,4 @@ +define weak void @bar() { +entry: + ret void +} diff --git a/compiler-rt/test/orc/TestCases/Generic/Inputs/baz-ret-void-hidden.ll b/compiler-rt/test/orc/TestCases/Generic/Inputs/baz-ret-void-hidden.ll new file mode 100644 index 0000000000000..27e19deea6ebd --- /dev/null +++ b/compiler-rt/test/orc/TestCases/Generic/Inputs/baz-ret-void-hidden.ll @@ -0,0 +1,4 @@ +define hidden void @baz() { +entry: + ret void +} diff --git a/compiler-rt/test/orc/TestCases/Generic/lazy-link.ll b/compiler-rt/test/orc/TestCases/Generic/lazy-link.ll index 5a8dbfc532b0f..1c375bcf1e62f 100644 --- a/compiler-rt/test/orc/TestCases/Generic/lazy-link.ll +++ b/compiler-rt/test/orc/TestCases/Generic/lazy-link.ll @@ -6,9 +6,11 @@ ; RUN: rm -rf %t && mkdir -p %t ; RUN: %clang -c -o %t/foo.o %S/Inputs/foo-ret-42.ll ; RUN: %clang -c -o %t/x.o %S/Inputs/var-x-42.ll +; RUN: %clang -c -o %t/bar.o %S/Inputs/bar-ret-void-weak.ll +; RUN: %clang -c -o %t/baz.o %S/Inputs/baz-ret-void-hidden.ll ; RUN: %clang -c -o %t/main.o %s ; RUN: %llvm_jitlink -noexec -show-linked-files %t/main.o -lazy %t/foo.o \ -; RUN: -lazy %t/x.o | FileCheck %s +; RUN: -lazy %t/x.o -lazy %t/bar.o -lazy %t/baz.o | FileCheck %s ; ; UNSUPPORTED: system-windows ; REQUIRES: target={{(arm|aarch|x86_)64.*}} @@ -21,9 +23,15 @@ declare i32 @foo() @x = external global i32 +declare void @bar() +declare hidden void @baz() + + define i32 @main(i32 %argc, ptr %argv) { entry: %foo_result = call i32 @foo() + call void @bar() + call void @baz() %x_val = load i32, ptr @x %result = add nsw i32 %foo_result, %x_val ret i32 %result diff --git a/llvm/lib/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.cpp b/llvm/lib/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.cpp index 06c545d62d76a..b5b380971d204 100644 --- a/llvm/lib/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.cpp @@ -47,7 +47,10 @@ void JITLinkRedirectableSymbolManager::emitRedirectableSymbols( Ptr.setScope(jitlink::Scope::Hidden); auto &Stub = PtrJumpStubCreator(*G, StubsSection, Ptr); Stub.setName(Name); - Stub.setScope(jitlink::Scope::Default); + Stub.setScope(Def.getFlags().isExported() ? jitlink::Scope::Default + : jitlink::Scope::Hidden); + Stub.setLinkage(!Def.getFlags().isWeak() ? 
jitlink::Linkage::Strong + : jitlink::Linkage::Weak); NewSymbols[std::move(PtrName)] = JITSymbolFlags(); } diff --git a/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp b/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp index 7b38621eba824..80f2a1304dde7 100644 --- a/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp @@ -387,7 +387,7 @@ void LazyReexportsManager::emitRedirectableSymbols( SymbolMap Redirs; size_t I = 0; for (auto &[Name, AI] : Reexports) - Redirs[Name] = (*ReentryPoints)[I++]; + Redirs[Name] = {(*ReentryPoints)[I++].getAddress(), AI.AliasFlags}; I = 0; if (!Reexports.empty()) { From 719c46b35958782622e300696fbe6af6549b4cdc Mon Sep 17 00:00:00 2001 From: Antonio Frighetto Date: Tue, 18 Feb 2025 12:42:37 +0100 Subject: [PATCH 113/127] [FunctionAttrs] Fix typo in `getArgumentAccessInfo` name (NFC) --- llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index a66d7ce9c3f50..02b0fcb3981a7 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -651,9 +651,9 @@ struct ArgumentUsesSummary { SmallDenseMap UsesPerBlock; }; -ArgumentAccessInfo getArgmentAccessInfo(const Instruction *I, - const ArgumentUse &ArgUse, - const DataLayout &DL) { +ArgumentAccessInfo getArgumentAccessInfo(const Instruction *I, + const ArgumentUse &ArgUse, + const DataLayout &DL) { auto GetTypeAccessRange = [&DL](Type *Ty, std::optional Offset) -> std::optional { @@ -805,7 +805,7 @@ ArgumentUsesSummary collectArgumentUsesPerBlock(Argument &A, Function &F) { } auto *I = cast(U); - bool HasWrite = UpdateUseInfo(I, getArgmentAccessInfo(I, ArgUse, DL)); + bool HasWrite = UpdateUseInfo(I, getArgumentAccessInfo(I, ArgUse, DL)); Result.HasAnyWrite |= HasWrite; From 519b53e65ef6ad5385d80d0726e48bbc3b08992f Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Tue, 18 Feb 2025 17:25:06 +0530 Subject: [PATCH 114/127] [CodeGen][NewPM] Port RegAllocEvictionAdvisor analysis to NPM (#117309) Legacy pass used to provide the advisor, so this extracts that logic into a provider class used by both analysis passes. All three (Default, Release, Development) legacy passes `*AdvisorAnalysis` are basically renamed to `*AdvisorProvider`, so the actual legacy wrapper passes are `*AdvisorAnalysisLegacy`. There is only one NPM analysis `RegAllocEvictionAnalysis` that switches between the three providers in the `::run` method, to be cached by the NPM. Also adds `RequireAnalysis` to the optimized target reg alloc codegen builder. 
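To illustrate the provider pattern described above, the following stripped-down C++ sketch uses placeholder types (not the real MachineFunction, RAGreedy, or analysis classes) to show the intended ownership: the analysis constructs the provider once, both pass-manager paths reuse it, and per-function advisors are created from it on demand.

```cpp
#include <cstdio>
#include <memory>

// Placeholder types; the real code works on MachineFunction, RAGreedy, etc.
struct MachineFunction { const char *Name; };
struct EvictionAdvisor { virtual ~EvictionAdvisor() = default; };
struct DefaultAdvisor final : EvictionAdvisor {};

// The shared provider keeps any expensive state (e.g. an ML model runner)
// and hands out a fresh advisor for each function being allocated.
struct EvictionAdvisorProvider {
  virtual ~EvictionAdvisorProvider() = default;
  virtual std::unique_ptr<EvictionAdvisor>
  getAdvisor(const MachineFunction &MF) = 0;
};

struct DefaultProvider final : EvictionAdvisorProvider {
  std::unique_ptr<EvictionAdvisor>
  getAdvisor(const MachineFunction &MF) override {
    std::printf("creating advisor for %s\n", MF.Name);
    return std::make_unique<DefaultAdvisor>();
  }
};

// Legacy-PM style wrapper: owns the provider.
struct LegacyAnalysis {
  std::unique_ptr<EvictionAdvisorProvider> Provider =
      std::make_unique<DefaultProvider>();
  EvictionAdvisorProvider &getProvider() { return *Provider; }
};

// New-PM style analysis result: a non-owning handle to the cached provider,
// so invalidation does not destroy it.
struct NPMResult { EvictionAdvisorProvider *Provider; };

int main() {
  LegacyAnalysis Legacy;
  MachineFunction MF{"foo"};
  auto A = Legacy.getProvider().getAdvisor(MF); // legacy path
  NPMResult R{&Legacy.getProvider()};
  auto B = R.Provider->getAdvisor(MF);          // new-PM path, same provider
  return (A && B) ? 0 : 1;
}
```

The non-owning pointer in the result type mirrors the design choice above: the provider's lifetime is tied to the analysis that created it, so the result can be re-handed out cheaply without re-running any setup.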
--- .../llvm}/CodeGen/RegAllocEvictionAdvisor.h | 98 +++++++- llvm/include/llvm/InitializePasses.h | 2 +- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 1 + .../llvm/Passes/MachinePassRegistry.def | 1 + llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp | 210 +++++++++++------- llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp | 130 ++++++++--- llvm/lib/CodeGen/RegAllocGreedy.cpp | 13 +- llvm/lib/CodeGen/RegAllocGreedy.h | 1 - llvm/lib/CodeGen/RegAllocPriorityAdvisor.h | 2 +- llvm/lib/Passes/PassBuilder.cpp | 1 + 10 files changed, 330 insertions(+), 129 deletions(-) rename llvm/{lib => include/llvm}/CodeGen/RegAllocEvictionAdvisor.h (71%) diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h b/llvm/include/llvm/CodeGen/RegAllocEvictionAdvisor.h similarity index 71% rename from llvm/lib/CodeGen/RegAllocEvictionAdvisor.h rename to llvm/include/llvm/CodeGen/RegAllocEvictionAdvisor.h index 52dd946a68540..a14548ff6959e 100644 --- a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h +++ b/llvm/include/llvm/CodeGen/RegAllocEvictionAdvisor.h @@ -9,13 +9,18 @@ #ifndef LLVM_CODEGEN_REGALLOCEVICTIONADVISOR_H #define LLVM_CODEGEN_REGALLOCEVICTIONADVISOR_H +#include "llvm/ADT/Any.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Register.h" #include "llvm/Config/llvm-config.h" +#include "llvm/IR/PassManager.h" #include "llvm/MC/MCRegister.h" #include "llvm/Pass.h" +#include "llvm/Support/Compiler.h" namespace llvm { class AllocationOrder; @@ -149,6 +154,35 @@ class RegAllocEvictionAdvisor { const bool EnableLocalReassign; }; +/// Common provider for legacy and new pass managers. +/// This keeps the state for logging, and sets up and holds the provider. +/// The legacy pass itself used to keep the logging state and provider, +/// so this extraction helps the NPM analysis to reuse the logic. +/// TODO: Coalesce this with the NPM analysis when legacy PM is removed. +class RegAllocEvictionAdvisorProvider { +public: + enum class AdvisorMode : int { Default, Release, Development }; + RegAllocEvictionAdvisorProvider(AdvisorMode Mode, LLVMContext &Ctx) + : Ctx(Ctx), Mode(Mode) {} + + virtual ~RegAllocEvictionAdvisorProvider() = default; + + virtual void logRewardIfNeeded(const MachineFunction &MF, + llvm::function_ref GetReward) {} + + virtual std::unique_ptr + getAdvisor(const MachineFunction &MF, const RAGreedy &RA, + MachineBlockFrequencyInfo *MBFI, MachineLoopInfo *Loops) = 0; + + AdvisorMode getAdvisorMode() const { return Mode; } + +protected: + LLVMContext &Ctx; + +private: + const AdvisorMode Mode; +}; + /// ImmutableAnalysis abstraction for fetching the Eviction Advisor. We model it /// as an analysis to decouple the user from the implementation insofar as /// dependencies on other analyses goes. The motivation for it being an @@ -164,20 +198,20 @@ class RegAllocEvictionAdvisor { /// /// Because we need to offer additional services in 'development' mode, the /// implementations of this analysis need to implement RTTI support. 
-class RegAllocEvictionAdvisorAnalysis : public ImmutablePass { +class RegAllocEvictionAdvisorAnalysisLegacy : public ImmutablePass { public: enum class AdvisorMode : int { Default, Release, Development }; - RegAllocEvictionAdvisorAnalysis(AdvisorMode Mode) - : ImmutablePass(ID), Mode(Mode){}; + RegAllocEvictionAdvisorAnalysisLegacy(AdvisorMode Mode) + : ImmutablePass(ID), Mode(Mode) {}; static char ID; /// Get an advisor for the given context (i.e. machine function, etc) - virtual std::unique_ptr - getAdvisor(const MachineFunction &MF, const RAGreedy &RA) = 0; + RegAllocEvictionAdvisorProvider &getProvider() { return *Provider; } + AdvisorMode getAdvisorMode() const { return Mode; } virtual void logRewardIfNeeded(const MachineFunction &MF, - llvm::function_ref GetReward){}; + function_ref GetReward) {}; protected: // This analysis preserves everything, and subclasses may have additional @@ -185,19 +219,65 @@ class RegAllocEvictionAdvisorAnalysis : public ImmutablePass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); } + std::unique_ptr Provider; private: StringRef getPassName() const override; const AdvisorMode Mode; }; +/// A MachineFunction analysis for fetching the Eviction Advisor. +/// This sets up the Provider lazily and caches it. +/// - in the ML implementation case, the evaluator is stateless but (especially +/// in the development mode) expensive to set up. With a Module Analysis, we +/// `require` it and set it up once. +/// - in the 'development' mode ML case, we want to capture the training log +/// during allocation (this is a log of features encountered and decisions +/// made), and then measure a score, potentially a few steps after allocation +/// completes. So we need a Module analysis to keep the logger state around +/// until we can make that measurement. +class RegAllocEvictionAdvisorAnalysis + : public AnalysisInfoMixin { + static AnalysisKey Key; + friend AnalysisInfoMixin; + +public: + struct Result { + // owned by this analysis + RegAllocEvictionAdvisorProvider *Provider; + + bool invalidate(MachineFunction &MF, const PreservedAnalyses &PA, + MachineFunctionAnalysisManager::Invalidator &Inv) { + // Provider is stateless and constructed only once. Do not get + // invalidated. + return false; + } + }; + + Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MAM); + +private: + void + initializeProvider(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode Mode, + LLVMContext &Ctx); + + std::unique_ptr Provider; +}; + /// Specialization for the API used by the analysis infrastructure to create /// an instance of the eviction advisor. 
-template <> Pass *callDefaultCtor(); +template <> Pass *callDefaultCtor(); + +RegAllocEvictionAdvisorAnalysisLegacy *createReleaseModeAdvisorAnalysisLegacy(); + +RegAllocEvictionAdvisorAnalysisLegacy * +createDevelopmentModeAdvisorAnalysisLegacy(); -RegAllocEvictionAdvisorAnalysis *createReleaseModeAdvisor(); +LLVM_ATTRIBUTE_RETURNS_NONNULL RegAllocEvictionAdvisorProvider * +createReleaseModeAdvisorProvider(LLVMContext &Ctx); -RegAllocEvictionAdvisorAnalysis *createDevelopmentModeAdvisor(); +RegAllocEvictionAdvisorProvider * +createDevelopmentModeAdvisorProvider(LLVMContext &Ctx); // TODO: move to RegAllocEvictionAdvisor.cpp when we move implementation // out of RegAllocGreedy.cpp diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index da4ffcd83213a..81a602c8889d8 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -251,7 +251,7 @@ void initializePseudoProbeInserterPass(PassRegistry &); void initializeRAGreedyPass(PassRegistry &); void initializeReachingDefAnalysisPass(PassRegistry &); void initializeReassociateLegacyPassPass(PassRegistry &); -void initializeRegAllocEvictionAdvisorAnalysisPass(PassRegistry &); +void initializeRegAllocEvictionAdvisorAnalysisLegacyPass(PassRegistry &); void initializeRegAllocFastPass(PassRegistry &); void initializeRegAllocPriorityAdvisorAnalysisPass(PassRegistry &); void initializeRegAllocScoringPass(PassRegistry &); diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 1458318ff021a..12781e2b84623 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -57,6 +57,7 @@ #include "llvm/CodeGen/PeepholeOptimizer.h" #include "llvm/CodeGen/PostRASchedulerList.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" +#include "llvm/CodeGen/RegAllocEvictionAdvisor.h" #include "llvm/CodeGen/RegAllocFast.h" #include "llvm/CodeGen/RegUsageInfoCollector.h" #include "llvm/CodeGen/RegUsageInfoPropagate.h" diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 075ebcb829553..2b5e258682585 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -114,6 +114,7 @@ MACHINE_FUNCTION_ANALYSIS("machine-post-dom-tree", MachinePostDominatorTreeAnalysis()) MACHINE_FUNCTION_ANALYSIS("machine-trace-metrics", MachineTraceMetricsAnalysis()) MACHINE_FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC)) +MACHINE_FUNCTION_ANALYSIS("regalloc-evict", RegAllocEvictionAdvisorAnalysis()) MACHINE_FUNCTION_ANALYSIS("slot-indexes", SlotIndexesAnalysis()) MACHINE_FUNCTION_ANALYSIS("spill-code-placement", SpillPlacementAnalysis()) MACHINE_FUNCTION_ANALYSIS("virtregmap", VirtRegMapAnalysis()) diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp index 9656774c6eaae..1a8e11de909e8 100644 --- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp +++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp @@ -11,11 +11,11 @@ //===----------------------------------------------------------------------===// #include "AllocationOrder.h" -#include "RegAllocEvictionAdvisor.h" #include "RegAllocGreedy.h" #include "llvm/Analysis/InteractiveModelRunner.h" #include "llvm/Analysis/MLModelRunner.h" #include "llvm/Analysis/TensorSpec.h" +#include "llvm/CodeGen/RegAllocEvictionAdvisor.h" #if defined(LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL) || 
defined(LLVM_HAVE_TFLITE) #include "llvm/Analysis/ModelUnderTrainingRunner.h" #include "llvm/Analysis/NoInferenceModelRunner.h" @@ -115,7 +115,7 @@ class RegAllocScoring : public MachineFunctionPass { /// RegAllocReward analysis usage. void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); @@ -389,11 +389,12 @@ class MLEvictAdvisor : public RegAllocEvictionAdvisor { // =================================== // Release (AOT) - specifics // =================================== -class ReleaseModeEvictionAdvisorAnalysis final - : public RegAllocEvictionAdvisorAnalysis { +/// Common provider for legacy and new pass managers. +class ReleaseModeEvictionAdvisorProvider final + : public RegAllocEvictionAdvisorProvider { public: - ReleaseModeEvictionAdvisorAnalysis() - : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Release) { + ReleaseModeEvictionAdvisorProvider(LLVMContext &Ctx) + : RegAllocEvictionAdvisorProvider(AdvisorMode::Release, Ctx) { if (EnableDevelopmentFeatures) { InputFeatures = {RA_EVICT_FEATURES_LIST( _DECL_FEATURES) RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_DECL_FEATURES) @@ -403,21 +404,13 @@ class ReleaseModeEvictionAdvisorAnalysis final } } // support for isa<> and dyn_cast. - static bool classof(const RegAllocEvictionAdvisorAnalysis *R) { + static bool classof(const RegAllocEvictionAdvisorProvider *R) { return R->getAdvisorMode() == AdvisorMode::Release; } -private: - std::vector InputFeatures; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - RegAllocEvictionAdvisorAnalysis::getAnalysisUsage(AU); - } - std::unique_ptr - getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override { + getAdvisor(const MachineFunction &MF, const RAGreedy &RA, + MachineBlockFrequencyInfo *MBFI, MachineLoopInfo *Loops) override { if (!Runner) { if (InteractiveChannelBaseName.empty()) Runner = std::make_unique>( @@ -428,14 +421,45 @@ class ReleaseModeEvictionAdvisorAnalysis final InteractiveChannelBaseName + ".out", InteractiveChannelBaseName + ".in"); } - return std::make_unique( - MF, RA, Runner.get(), - getAnalysis().getMBFI(), - getAnalysis().getLI()); + assert(MBFI && Loops && + "Invalid provider state: must have analysis available"); + return std::make_unique(MF, RA, Runner.get(), *MBFI, + *Loops); } + +private: + std::vector InputFeatures; std::unique_ptr Runner; }; +class ReleaseModeEvictionAdvisorAnalysisLegacy final + : public RegAllocEvictionAdvisorAnalysisLegacy { +public: + ReleaseModeEvictionAdvisorAnalysisLegacy() + : RegAllocEvictionAdvisorAnalysisLegacy(AdvisorMode::Release) {} + + void logRewardIfNeeded(const MachineFunction &MF, + llvm::function_ref GetReward) override { + // No-op in release mode + } + + bool doInitialization(Module &M) override { + Provider = + std::make_unique(M.getContext()); + return false; + } + + static bool classof(const RegAllocEvictionAdvisorAnalysisLegacy *R) { + return R->getAdvisorMode() == AdvisorMode::Release; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + RegAllocEvictionAdvisorAnalysisLegacy::getAnalysisUsage(AU); + } +}; + // =================================== // Development mode-specifics // =================================== @@ -468,11 +492,11 @@ class DevelopmentModeEvictAdvisor : public MLEvictAdvisor { Logger *const Log; }; -class DevelopmentModeEvictionAdvisorAnalysis final - : public 
RegAllocEvictionAdvisorAnalysis { +class DevelopmentModeEvictionAdvisorProvider final + : public RegAllocEvictionAdvisorProvider { public: - DevelopmentModeEvictionAdvisorAnalysis() - : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Development) { + DevelopmentModeEvictionAdvisorProvider(LLVMContext &Ctx) + : RegAllocEvictionAdvisorProvider(AdvisorMode::Development, Ctx) { if (EnableDevelopmentFeatures) { InputFeatures = {RA_EVICT_FEATURES_LIST( _DECL_FEATURES) RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_DECL_FEATURES) @@ -492,44 +516,10 @@ class DevelopmentModeEvictionAdvisorAnalysis final TensorSpec::createSpec("action_step_type", {1}), TensorSpec::createSpec("action_reward", {1})}; } - } - // support for isa<> and dyn_cast. - static bool classof(const RegAllocEvictionAdvisorAnalysis *R) { - return R->getAdvisorMode() == AdvisorMode::Development; - } - - void logRewardIfNeeded(const MachineFunction &MF, - llvm::function_ref GetReward) override { - if (!Log || !Log->hasAnyObservationForContext(MF.getName())) - return; - // The function pass manager would run all the function passes for a - // function, so we assume the last context belongs to this function. If - // this invariant ever changes, we can implement at that time switching - // contexts. At this point, it'd be an error - if (Log->currentContext() != MF.getName()) { - MF.getFunction().getContext().emitError( - "The training log context shouldn't have had changed."); - } - if (Log->hasObservationInProgress()) - Log->logReward(GetReward()); - } - -private: - std::vector InputFeatures; - std::vector TrainingInputFeatures; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - RegAllocEvictionAdvisorAnalysis::getAnalysisUsage(AU); - } - - bool doInitialization(Module &M) override { - LLVMContext &Ctx = M.getContext(); if (ModelUnderTraining.empty() && TrainingLog.empty()) { Ctx.emitError("Regalloc development mode should be requested with at " "least logging enabled and/or a training model"); - return false; + return; } if (ModelUnderTraining.empty()) Runner = std::make_unique(Ctx, InputFeatures); @@ -538,15 +528,15 @@ class DevelopmentModeEvictionAdvisorAnalysis final Ctx, ModelUnderTraining, DecisionName, TrainingInputFeatures); if (!Runner) { Ctx.emitError("Regalloc: could not set up the model runner"); - return false; + return; } if (TrainingLog.empty()) - return false; + return; std::error_code EC; auto OS = std::make_unique(TrainingLog, EC); if (EC) { - M.getContext().emitError(EC.message() + ":" + TrainingLog); - return false; + Ctx.emitError(EC.message() + ":" + TrainingLog); + return; } std::vector LFS = InputFeatures; if (auto *MUTR = dyn_cast(Runner.get())) @@ -558,25 +548,80 @@ class DevelopmentModeEvictionAdvisorAnalysis final Log = std::make_unique(std::move(OS), LFS, Reward, /*IncludeReward*/ true); - return false; + return; + } + + // support for isa<> and dyn_cast. + static bool classof(const RegAllocEvictionAdvisorProvider *R) { + return R->getAdvisorMode() == AdvisorMode::Development; + } + + void logRewardIfNeeded(const MachineFunction &MF, + llvm::function_ref GetReward) override { + if (!Log || !Log->hasAnyObservationForContext(MF.getName())) + return; + // The function pass manager would run all the function passes for a + // function, so we assume the last context belongs to this function. If + // this invariant ever changes, we can implement at that time switching + // contexts. 
At this point, it'd be an error + if (Log->currentContext() != MF.getName()) { + MF.getFunction().getContext().emitError( + "The training log context shouldn't have had changed."); + } + if (Log->hasObservationInProgress()) + Log->logReward(GetReward()); } std::unique_ptr - getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override { + getAdvisor(const MachineFunction &MF, const RAGreedy &RA, + MachineBlockFrequencyInfo *MBFI, MachineLoopInfo *Loops) override { if (!Runner) return nullptr; if (Log) Log->switchContext(MF.getName()); + assert(MBFI && Loops && + "Invalid provider state: must have analysis available"); return std::make_unique( - MF, RA, Runner.get(), - getAnalysis().getMBFI(), - getAnalysis().getLI(), Log.get()); + MF, RA, Runner.get(), *MBFI, *Loops, Log.get()); } +private: + std::vector InputFeatures; + std::vector TrainingInputFeatures; + std::unique_ptr Runner; std::unique_ptr Log; }; +class DevelopmentModeEvictionAdvisorAnalysisLegacy final + : public RegAllocEvictionAdvisorAnalysisLegacy { +public: + DevelopmentModeEvictionAdvisorAnalysisLegacy() + : RegAllocEvictionAdvisorAnalysisLegacy(AdvisorMode::Development) {} + + bool doInitialization(Module &M) override { + Provider = std::make_unique( + M.getContext()); + return false; + } + + void logRewardIfNeeded(const MachineFunction &MF, + llvm::function_ref GetReward) override { + Provider->logRewardIfNeeded(MF, GetReward); + } + + // support for isa<> and dyn_cast. + static bool classof(const RegAllocEvictionAdvisorAnalysisLegacy *R) { + return R->getAdvisorMode() == AdvisorMode::Development; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + RegAllocEvictionAdvisorAnalysisLegacy::getAnalysisUsage(AU); + } +}; + #endif // #ifdef LLVM_HAVE_TFLITE } // namespace @@ -1127,8 +1172,9 @@ void llvm::extractMBBFrequency( // Development mode-specific implementations #ifdef LLVM_HAVE_TFLITE -RegAllocEvictionAdvisorAnalysis *llvm::createDevelopmentModeAdvisor() { - return new DevelopmentModeEvictionAdvisorAnalysis(); +RegAllocEvictionAdvisorAnalysisLegacy * +llvm::createDevelopmentModeAdvisorAnalysisLegacy() { + return new DevelopmentModeEvictionAdvisorAnalysisLegacy(); } int64_t DevelopmentModeEvictAdvisor::tryFindEvictionCandidatePosition( @@ -1194,18 +1240,32 @@ bool RegAllocScoring::runOnMachineFunction(MachineFunction &MF) { return *CachedReward; }; - getAnalysis().logRewardIfNeeded(MF, - GetReward); + getAnalysis().logRewardIfNeeded( + MF, GetReward); getAnalysis().logRewardIfNeeded(MF, GetReward); return false; } #endif // #ifdef LLVM_HAVE_TFLITE -RegAllocEvictionAdvisorAnalysis *llvm::createReleaseModeAdvisor() { +RegAllocEvictionAdvisorProvider * +llvm::createReleaseModeAdvisorProvider(LLVMContext &Ctx) { + return new ReleaseModeEvictionAdvisorProvider(Ctx); +} + +RegAllocEvictionAdvisorProvider * +llvm::createDevelopmentModeAdvisorProvider(LLVMContext &Ctx) { +#if defined(LLVM_HAVE_TFLITE) + return new DevelopmentModeEvictionAdvisorProvider(Ctx); +#endif + return nullptr; +} + +RegAllocEvictionAdvisorAnalysisLegacy * +llvm::createReleaseModeAdvisorAnalysisLegacy() { return llvm::isEmbeddedModelEvaluatorValid() || !InteractiveChannelBaseName.empty() - ? new ReleaseModeEvictionAdvisorAnalysis() + ? 
new ReleaseModeEvictionAdvisorAnalysisLegacy() : nullptr; } diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp index a1f441ebd0d5e..2369615ef0fb6 100644 --- a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp +++ b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp @@ -9,12 +9,14 @@ // Implementation of the default eviction advisor and of the Analysis pass. // //===----------------------------------------------------------------------===// - -#include "RegAllocEvictionAdvisor.h" +#include "llvm/CodeGen/RegAllocEvictionAdvisor.h" #include "AllocationOrder.h" #include "RegAllocGreedy.h" +#include "RegAllocPriorityAdvisor.h" #include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/IR/Module.h" @@ -26,17 +28,18 @@ using namespace llvm; -static cl::opt Mode( +static cl::opt Mode( "regalloc-enable-advisor", cl::Hidden, - cl::init(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Default), + cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values( - clEnumValN(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Default, + clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), - clEnumValN(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Release, + clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), - clEnumValN(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Development, - "development", "for training"))); + clEnumValN( + RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, + "development", "for training"))); static cl::opt EnableLocalReassignment( "enable-local-reassign", cl::Hidden, @@ -59,59 +62,112 @@ cl::opt EvictInterferenceCutoff( #define LLVM_HAVE_TF_AOT #endif -char RegAllocEvictionAdvisorAnalysis::ID = 0; -INITIALIZE_PASS(RegAllocEvictionAdvisorAnalysis, "regalloc-evict", +char RegAllocEvictionAdvisorAnalysisLegacy::ID = 0; +INITIALIZE_PASS(RegAllocEvictionAdvisorAnalysisLegacy, "regalloc-evict", "Regalloc eviction policy", false, true) namespace { -class DefaultEvictionAdvisorAnalysis final - : public RegAllocEvictionAdvisorAnalysis { +class DefaultEvictionAdvisorProvider final + : public RegAllocEvictionAdvisorProvider { public: - DefaultEvictionAdvisorAnalysis(bool NotAsRequested) - : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Default), - NotAsRequested(NotAsRequested) {} + DefaultEvictionAdvisorProvider(bool NotAsRequested, LLVMContext &Ctx) + : RegAllocEvictionAdvisorProvider(AdvisorMode::Default, Ctx) { + if (NotAsRequested) + Ctx.emitError("Requested regalloc eviction advisor analysis " + "could not be created. Using default"); + } // support for isa<> and dyn_cast. 
- static bool classof(const RegAllocEvictionAdvisorAnalysis *R) { + static bool classof(const RegAllocEvictionAdvisorProvider *R) { return R->getAdvisorMode() == AdvisorMode::Default; } -private: std::unique_ptr - getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override { + getAdvisor(const MachineFunction &MF, const RAGreedy &RA, + MachineBlockFrequencyInfo *, MachineLoopInfo *) override { return std::make_unique(MF, RA); } +}; + +class DefaultEvictionAdvisorAnalysisLegacy final + : public RegAllocEvictionAdvisorAnalysisLegacy { +public: + DefaultEvictionAdvisorAnalysisLegacy(bool NotAsRequested) + : RegAllocEvictionAdvisorAnalysisLegacy(AdvisorMode::Default), + NotAsRequested(NotAsRequested) {} + bool doInitialization(Module &M) override { - if (NotAsRequested) - M.getContext().emitError("Requested regalloc eviction advisor analysis " - "could not be created. Using default"); - return RegAllocEvictionAdvisorAnalysis::doInitialization(M); + Provider.reset( + new DefaultEvictionAdvisorProvider(NotAsRequested, M.getContext())); + return false; + } + + // support for isa<> and dyn_cast. + static bool classof(const RegAllocEvictionAdvisorAnalysisLegacy *R) { + return R->getAdvisorMode() == AdvisorMode::Default; } + +private: const bool NotAsRequested; }; } // namespace -template <> Pass *llvm::callDefaultCtor() { - Pass *Ret = nullptr; +AnalysisKey RegAllocEvictionAdvisorAnalysis::Key; + +void RegAllocEvictionAdvisorAnalysis::initializeProvider( + RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode Mode, LLVMContext &Ctx) { + if (Provider) + return; + switch (Mode) { + case RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default: + Provider.reset( + new DefaultEvictionAdvisorProvider(/*NotAsRequested=*/false, Ctx)); + return; + case RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development: +#if defined(LLVM_HAVE_TFLITE) + Provider.reset(createDevelopmentModeAdvisorProvider(Ctx)); +#else + Provider.reset( + new DefaultEvictionAdvisorProvider(/*NotAsRequested=*/true, Ctx)); +#endif + return; + case RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release: + Provider.reset(createReleaseModeAdvisorProvider(Ctx)); + return; + } +} + +RegAllocEvictionAdvisorAnalysis::Result +RegAllocEvictionAdvisorAnalysis::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + // Lazy initialization of the provider. 
+ initializeProvider(::Mode, MF.getFunction().getContext()); + return Result{Provider.get()}; +} + +template <> +Pass *llvm::callDefaultCtor() { switch (Mode) { - case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Default: - Ret = new DefaultEvictionAdvisorAnalysis(/*NotAsRequested*/ false); - break; - case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Development: + case RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default: + return new DefaultEvictionAdvisorAnalysisLegacy(/*NotAsRequested=*/false); + case RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release: { + Pass *Ret = createReleaseModeAdvisorAnalysisLegacy(); + // release mode advisor may not be supported + if (Ret) + return Ret; + return new DefaultEvictionAdvisorAnalysisLegacy(/*NotAsRequested=*/true); + } + case RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development: #if defined(LLVM_HAVE_TFLITE) - Ret = createDevelopmentModeAdvisor(); + return createDevelopmentModeAdvisorAnalysisLegacy(); +#else + return new DefaultEvictionAdvisorAnalysisLegacy(/*NotAsRequested=*/true); #endif - break; - case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Release: - Ret = createReleaseModeAdvisor(); - break; } - if (Ret) - return Ret; - return new DefaultEvictionAdvisorAnalysis(/*NotAsRequested*/ true); + llvm_unreachable("unexpected advisor mode"); } -StringRef RegAllocEvictionAdvisorAnalysis::getPassName() const { +StringRef RegAllocEvictionAdvisorAnalysisLegacy::getPassName() const { switch (getAdvisorMode()) { case AdvisorMode::Default: return "Default Regalloc Eviction Advisor"; diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 7c7eb2ad52b41..9318c1df0b5e2 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -15,7 +15,6 @@ #include "AllocationOrder.h" #include "InterferenceCache.h" #include "RegAllocBase.h" -#include "RegAllocEvictionAdvisor.h" #include "RegAllocPriorityAdvisor.h" #include "SplitKit.h" #include "llvm/ADT/ArrayRef.h" @@ -46,6 +45,7 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegAllocEvictionAdvisor.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/SlotIndexes.h" @@ -164,7 +164,7 @@ INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(EdgeBundlesWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(SpillPlacementWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass) -INITIALIZE_PASS_DEPENDENCY(RegAllocEvictionAdvisorAnalysis) +INITIALIZE_PASS_DEPENDENCY(RegAllocEvictionAdvisorAnalysisLegacy) INITIALIZE_PASS_DEPENDENCY(RegAllocPriorityAdvisorAnalysis) INITIALIZE_PASS_END(RAGreedy, "greedy", "Greedy Register Allocator", false, false) @@ -219,7 +219,7 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -2765,8 +2765,11 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { : TRI->reverseLocalAssignment(); ExtraInfo.emplace(); - EvictAdvisor = - getAnalysis().getAdvisor(*MF, *this); + + auto &EvictAdvisorProvider = + getAnalysis().getProvider(); + EvictAdvisor = EvictAdvisorProvider.getAdvisor(*MF, *this, MBFI, Loops); + PriorityAdvisor = getAnalysis().getAdvisor(*MF, *this); diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h 
b/llvm/lib/CodeGen/RegAllocGreedy.h index e1ec63b4a5296..1d55a8241d760 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.h +++ b/llvm/lib/CodeGen/RegAllocGreedy.h @@ -14,7 +14,6 @@ #include "InterferenceCache.h" #include "RegAllocBase.h" -#include "RegAllocEvictionAdvisor.h" #include "RegAllocPriorityAdvisor.h" #include "SplitKit.h" #include "llvm/ADT/ArrayRef.h" diff --git a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h b/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h index 32e4598b71539..0758743c2b140 100644 --- a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h +++ b/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h @@ -9,7 +9,7 @@ #ifndef LLVM_CODEGEN_REGALLOCPRIORITYADVISOR_H #define LLVM_CODEGEN_REGALLOCPRIORITYADVISOR_H -#include "RegAllocEvictionAdvisor.h" +#include "llvm/CodeGen/RegAllocEvictionAdvisor.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/Pass.h" diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 8d5c0b3c13e01..96939f89279c6 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -127,6 +127,7 @@ #include "llvm/CodeGen/PeepholeOptimizer.h" #include "llvm/CodeGen/PostRASchedulerList.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" +#include "llvm/CodeGen/RegAllocEvictionAdvisor.h" #include "llvm/CodeGen/RegAllocFast.h" #include "llvm/CodeGen/RegUsageInfoCollector.h" #include "llvm/CodeGen/RegUsageInfoPropagate.h" From d64f177a2f4ae91cf520111dffed73f3c6b555eb Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Tue, 18 Feb 2025 12:59:15 +0100 Subject: [PATCH 115/127] [mlir][bazel] Fix `no-allow-shlib-undefined` errors. The BUILD file changes in https://github.com/llvm/llvm-project/pull/127544 adds `LinalgInterfaces` which is incomplete without `LinalgDialect`. For now, just add the `LinalgDialect` as dependency to tests which do not otherwise depend on it (but depend on `LinalgInterfaces` through e.g. `TensorDialect`). This is a temporary solution until the dependency of `TensorDialect` is trimmed to just the `linalg::RelayoutOpInterface`, but not the other linalg interfaces. See https://github.com/llvm/llvm-project/pull/127544#pullrequestreview-2622065243. 
--- utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel index a55c6f50118dc..d0c9f56f81cb9 100644 --- a/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel @@ -176,6 +176,7 @@ cc_test( "//mlir:ArithDialect", "//mlir:FuncDialect", "//mlir:IR", + "//mlir:LinalgDialect", "//mlir:Parser", "//mlir:SCFDialect", "//mlir:SideEffectInterfaces", @@ -211,6 +212,7 @@ cc_test( "//llvm:Support", "//llvm:TestingSupport", "//mlir:IR", + "//mlir:LinalgDialect", "//mlir:SPIRVBinaryUtils", "//mlir:SPIRVDeserialization", "//mlir:SPIRVDialect", From 91ef371ae6201d481358a816d9b8fbea2ac3f565 Mon Sep 17 00:00:00 2001 From: Hyunsung Lee Date: Tue, 18 Feb 2025 21:04:50 +0900 Subject: [PATCH 116/127] [MLIR] Update `operator<<` in objects of DataFlowFramework.h (#127586) --- mlir/include/mlir/Analysis/DataFlowFramework.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Analysis/DataFlowFramework.h b/mlir/include/mlir/Analysis/DataFlowFramework.h index a3714c4332fbb..6aa0900d1412a 100644 --- a/mlir/include/mlir/Analysis/DataFlowFramework.h +++ b/mlir/include/mlir/Analysis/DataFlowFramework.h @@ -146,7 +146,7 @@ struct ProgramPoint : public StorageUniquer::BaseStorage { Operation *op = nullptr; }; -inline raw_ostream &operator<<(raw_ostream &os, ProgramPoint point) { +inline raw_ostream &operator<<(raw_ostream &os, const ProgramPoint &point) { point.print(os); return os; } @@ -662,7 +662,7 @@ inline raw_ostream &operator<<(raw_ostream &os, const AnalysisState &state) { return os; } -inline raw_ostream &operator<<(raw_ostream &os, LatticeAnchor anchor) { +inline raw_ostream &operator<<(raw_ostream &os, const LatticeAnchor &anchor) { anchor.print(os); return os; } From 1c6cecdbdd2470292ce0b508922d807e3100f85c Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 18 Feb 2025 12:06:14 +0000 Subject: [PATCH 117/127] [libclc] Suppress data-layout warnings during linking (#127532) libclc uses llvm-link to link together all of the individually built libclc builtins files into one module. Some of these builtins files are compiled from source by clang whilst others are converted from LLVM IR directly to bytecode. When llvm-link links a 'source' module into a 'destination' module, it warns if the two modules have differing data layouts. The LLVM IR files libclc links either have no data layout (shared submodule files) or an explicit data layout in the case of certain amdgcn/r600 files. The warnings are very noisy and largely inconsequential. We can suppress them by exploiting a specific behaviour exhibited by llvm-link. When the destination module has no data layout, it is given the source module's data layout. Thus, if we link together all IR files first, followed by the clang-compiled modules, 99% of the warnings are suppressed as they arose from linking an empty data layout into a non-empty one. The remaining warnings came from the amdgcn and r600 targets. Some of these were because the data layouts were out of date compared with what clang currently produced, so those could have been updated. However, even with those changes and by grouping the IR files together, the linker may still link explicit data layouts with empty ones depending on the order the IR files are processed. As it happens, the data layouts aren't essential.
With the changes to the link line we can rely on those IR files receiving the correct data layout from the clang-compiled modules later in the link line. This also makes the previously AMDGPU-specific IR files available to be used by all targets in a generic capacity in the future. --- .../minmax_helpers.ll | 6 ------ libclc/cmake/modules/AddLibclc.cmake | 19 +++++++++++++++++-- .../lib/image/get_image_attributes_impl.ll | 2 -- libclc/r600/lib/image/read_image_impl.ll | 2 -- libclc/r600/lib/image/write_image_impl.ll | 2 -- 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/libclc/amdgcn/lib/cl_khr_int64_extended_atomics/minmax_helpers.ll b/libclc/amdgcn/lib/cl_khr_int64_extended_atomics/minmax_helpers.ll index 98f1f54718a1f..7f12556c0abbc 100644 --- a/libclc/amdgcn/lib/cl_khr_int64_extended_atomics/minmax_helpers.ll +++ b/libclc/amdgcn/lib/cl_khr_int64_extended_atomics/minmax_helpers.ll @@ -1,9 +1,3 @@ -#if __clang_major__ >= 7 -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" -#else -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" -#endif - define i64 @__clc__sync_fetch_and_min_global_8(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { entry: %0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %value seq_cst diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake index 0bf6f98452ecd..40e31e0ba4f45 100644 --- a/libclc/cmake/modules/AddLibclc.cmake +++ b/libclc/cmake/modules/AddLibclc.cmake @@ -225,7 +225,8 @@ function(add_libclc_builtin_set) message( FATAL_ERROR "Must provide ARCH, ARCH_SUFFIX, and TRIPLE" ) endif() - set( bytecode_files "" ) + set( bytecode_files ) + set( bytecode_ir_files ) foreach( file IN LISTS ARG_GEN_FILES ARG_LIB_FILES ) # We need to take each file and produce an absolute input file, as well # as a unique architecture-specific output file. We deal with a mix of @@ -263,9 +264,23 @@ function(add_libclc_builtin_set) "${ARG_COMPILE_FLAGS}" -I${CMAKE_CURRENT_SOURCE_DIR}/${file_dir} DEPENDENCIES ${input_file_dep} ) - list( APPEND bytecode_files ${output_file} ) + + # Collect all files originating in LLVM IR separately + get_filename_component( file_ext ${file} EXT ) + if( ${file_ext} STREQUAL ".ll" ) + list( APPEND bytecode_ir_files ${output_file} ) + else() + list( APPEND bytecode_files ${output_file} ) + endif() endforeach() + # Prepend all LLVM IR files to the list so they are linked into the final + # bytecode modules first. This helps to suppress unnecessary warnings + # regarding different data layouts while linking. Any LLVM IR files without a + # data layout will (silently) be given the first data layout the linking + # process comes across. 
+ list( PREPEND bytecode_files ${bytecode_ir_files} ) + set( builtins_comp_lib_tgt builtins.comp.${ARG_ARCH_SUFFIX} ) add_custom_target( ${builtins_comp_lib_tgt} DEPENDS ${bytecode_files} diff --git a/libclc/r600/lib/image/get_image_attributes_impl.ll b/libclc/r600/lib/image/get_image_attributes_impl.ll index f867ab6603591..7f1965de7602c 100644 --- a/libclc/r600/lib/image/get_image_attributes_impl.ll +++ b/libclc/r600/lib/image/get_image_attributes_impl.ll @@ -1,5 +1,3 @@ -target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" - %opencl.image2d_t = type opaque %opencl.image3d_t = type opaque diff --git a/libclc/r600/lib/image/read_image_impl.ll b/libclc/r600/lib/image/read_image_impl.ll index ca2e465b4b5b8..229a2526c3743 100644 --- a/libclc/r600/lib/image/read_image_impl.ll +++ b/libclc/r600/lib/image/read_image_impl.ll @@ -1,5 +1,3 @@ -target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" - %opencl.image2d_t = type opaque declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, diff --git a/libclc/r600/lib/image/write_image_impl.ll b/libclc/r600/lib/image/write_image_impl.ll index 03595ba1db737..265f5d6045e42 100644 --- a/libclc/r600/lib/image/write_image_impl.ll +++ b/libclc/r600/lib/image/write_image_impl.ll @@ -1,5 +1,3 @@ -target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" - %opencl.image2d_t = type opaque %opencl.image3d_t = type opaque From df300a4a67affc2262131486314d2ca16688eda3 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Tue, 18 Feb 2025 12:39:42 +0000 Subject: [PATCH 118/127] [llvm][docs] Fix typo in Backporting section of GitHub.rst. --- llvm/docs/GitHub.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/docs/GitHub.rst b/llvm/docs/GitHub.rst index b5b75db91e1c4..979b87c8d02f6 100644 --- a/llvm/docs/GitHub.rst +++ b/llvm/docs/GitHub.rst @@ -438,7 +438,7 @@ Releases Backporting Fixes to the Release Branches ----------------------------------------- You can use special comments on issues or pull requests to make backport -requests for the release branches. To do this, after your pull reuest has been +requests for the release branches. To do this, after your pull request has been merged: 1. Edit "Milestone" at the right side of the isssue or pull request From 93d3e20bb226507c6eb777cfb15ea13f2cd129e8 Mon Sep 17 00:00:00 2001 From: Igor Wodiany Date: Tue, 18 Feb 2025 12:46:33 +0000 Subject: [PATCH 119/127] [mlir][spirv] Add definition for OpKill (#126554) Although the operation is deprecated in the most recent version of the SPIR-V spec, it is still used by older shaders, so having it defined is valuable and incurs negligible maintenance overhead, due to op simplicity. 
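As a side note, here is a tiny self-contained C++ sketch of the inlining restriction added below: the legality hook simply refuses to inline any callee that contains a kill-style terminator. The types are placeholders, not the real MLIR Operation or DialectInlinerInterface classes, so this only illustrates the shape of the check.

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Placeholder "operation": just a name. The real hook inspects the actual op.
struct Op { std::string Name; };
struct Region { std::vector<Op> Ops; };

// Mirrors the dialect inliner hook: callables containing a kill-style
// terminator are never inlined, everything else is allowed.
static bool isLegalToInline(const Region &Callee) {
  for (const Op &O : Callee.Ops)
    if (O.Name == "spirv.Kill")
      return false;
  return true;
}

int main() {
  Region Plain{{Op{"spirv.Return"}}};
  Region Killer{{Op{"spirv.Kill"}}};
  std::printf("plain callee inlinable: %d\n", isLegalToInline(Plain));
  std::printf("kill callee inlinable:  %d\n", isLegalToInline(Killer));
  return 0;
}
```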
--- .../mlir/Dialect/SPIRV/IR/SPIRVBase.td | 3 +- .../Dialect/SPIRV/IR/SPIRVControlFlowOps.td | 42 +++++++++++++++++++ mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp | 6 ++- .../Dialect/SPIRV/IR/control-flow-ops.mlir | 12 ++++++ .../Dialect/SPIRV/Transforms/inlining.mlir | 21 +++++++++- mlir/test/Target/SPIRV/terminator.mlir | 6 +++ 6 files changed, 87 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td index 1eacc564655a8..cafe140469570 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td @@ -4445,6 +4445,7 @@ def SPIRV_OC_OpSelectionMerge : I32EnumAttrCase<"OpSelectionMerge def SPIRV_OC_OpLabel : I32EnumAttrCase<"OpLabel", 248>; def SPIRV_OC_OpBranch : I32EnumAttrCase<"OpBranch", 249>; def SPIRV_OC_OpBranchConditional : I32EnumAttrCase<"OpBranchConditional", 250>; +def SPIRV_OC_OpKill : I32EnumAttrCase<"OpKill", 252>; def SPIRV_OC_OpReturn : I32EnumAttrCase<"OpReturn", 253>; def SPIRV_OC_OpReturnValue : I32EnumAttrCase<"OpReturnValue", 254>; def SPIRV_OC_OpUnreachable : I32EnumAttrCase<"OpUnreachable", 255>; @@ -4574,7 +4575,7 @@ def SPIRV_OpcodeAttr : SPIRV_OC_OpAtomicAnd, SPIRV_OC_OpAtomicOr, SPIRV_OC_OpAtomicXor, SPIRV_OC_OpPhi, SPIRV_OC_OpLoopMerge, SPIRV_OC_OpSelectionMerge, SPIRV_OC_OpLabel, SPIRV_OC_OpBranch, SPIRV_OC_OpBranchConditional, - SPIRV_OC_OpReturn, SPIRV_OC_OpReturnValue, SPIRV_OC_OpUnreachable, + SPIRV_OC_OpKill, SPIRV_OC_OpReturn, SPIRV_OC_OpReturnValue, SPIRV_OC_OpUnreachable, SPIRV_OC_OpGroupBroadcast, SPIRV_OC_OpGroupIAdd, SPIRV_OC_OpGroupFAdd, SPIRV_OC_OpGroupFMin, SPIRV_OC_OpGroupUMin, SPIRV_OC_OpGroupSMin, SPIRV_OC_OpGroupFMax, SPIRV_OC_OpGroupUMax, SPIRV_OC_OpGroupSMax, diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVControlFlowOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVControlFlowOps.td index cc2f0e4962d8a..ade20f915c0c3 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVControlFlowOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVControlFlowOps.td @@ -242,6 +242,48 @@ def SPIRV_FunctionCallOp : SPIRV_Op<"FunctionCall", [ // ----- +def SPIRV_KillOp : SPIRV_Op<"Kill", [Terminator]> { + let summary = [{ + Deprecated (use OpTerminateInvocation or OpDemoteToHelperInvocation). + }]; + + let description = [{ + Fragment-shader discard. + + Ceases all further processing in any invocation that executes it: Only + instructions these invocations executed before OpKill have observable + side effects. If this instruction is executed in non-uniform control + flow, all subsequent control flow is non-uniform (for invocations that + continue to execute). + + This instruction must be the last instruction in a block. + + This instruction is only valid in the Fragment Execution Model. 
+ + + + #### Example: + + ```mlir + spirv.Kill + ``` + }]; + + let availability = [ + MinVersion, + MaxVersion, + Extension<[]>, + Capability<[SPIRV_C_Shader]> + ]; + + let arguments = (ins); + let results = (outs); + let assemblyFormat = "attr-dict"; + let hasVerifier = 0; +} + +// ----- + def SPIRV_LoopOp : SPIRV_Op<"mlir.loop", [InFunctionScope]> { let summary = "Define a structured loop."; diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp index 48be287ef833b..0cf5f0823be63 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp @@ -84,7 +84,11 @@ struct SPIRVInlinerInterface : public DialectInlinerInterface { // TODO: we need to filter OpKill here to avoid inlining it to // a loop continue construct: // https://github.com/KhronosGroup/SPIRV-Headers/issues/86 - // However OpKill is fragment shader specific and we don't support it yet. + // For now, we just disallow inlining OpKill anywhere in the code, + // but this restriction should be relaxed, as pointed above. + if (isa(op)) + return false; + return true; } diff --git a/mlir/test/Dialect/SPIRV/IR/control-flow-ops.mlir b/mlir/test/Dialect/SPIRV/IR/control-flow-ops.mlir index 8496448759f0c..1d1e2840a579a 100644 --- a/mlir/test/Dialect/SPIRV/IR/control-flow-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/control-flow-ops.mlir @@ -789,3 +789,15 @@ func.func @unreachable() { // expected-error @+1 {{cannot be used in reachable block}} spirv.Unreachable } + +// ----- + +//===----------------------------------------------------------------------===// +// spirv.Kill +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @kill +func.func @kill() { + // CHECK: spirv.Kill + spirv.Kill +} diff --git a/mlir/test/Dialect/SPIRV/Transforms/inlining.mlir b/mlir/test/Dialect/SPIRV/Transforms/inlining.mlir index bd3c665013136..8eb48a34e61e8 100644 --- a/mlir/test/Dialect/SPIRV/Transforms/inlining.mlir +++ b/mlir/test/Dialect/SPIRV/Transforms/inlining.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -split-input-file -pass-pipeline='builtin.module(spirv.module(inline{default-pipeline=''}))' | FileCheck %s +// RUN: mlir-opt %s --split-input-file --pass-pipeline='builtin.module(spirv.module(inline{default-pipeline=''}))' | FileCheck %s spirv.module Logical GLSL450 { spirv.func @callee() "None" { @@ -246,5 +246,24 @@ spirv.module Logical GLSL450 { } } +// ----- + +spirv.module Logical GLSL450 { + // CHECK-LABEL: @callee + spirv.func @callee() -> () "None" { + // CHECK-NEXT: spirv.Kill + spirv.Kill + } + + // CHECK-LABEL: @do_not_inline_kill + spirv.func @do_not_inline_kill() -> () "None" { + // CHECK-NOT: spirv.Kill + // CHECK-NEXT: spirv.FunctionCall @callee() : () -> () + spirv.FunctionCall @callee() : () -> () + // CHECK-NEXT: spirv.Return + spirv.Return + } +} + // TODO: Add tests for inlining structured control flow into // structured control flow. 
diff --git a/mlir/test/Target/SPIRV/terminator.mlir b/mlir/test/Target/SPIRV/terminator.mlir index 065b68b9bdfbb..8338a575681f1 100644 --- a/mlir/test/Target/SPIRV/terminator.mlir +++ b/mlir/test/Target/SPIRV/terminator.mlir @@ -24,4 +24,10 @@ spirv.module Logical GLSL450 requires #spirv.vce { // CHECK-NOT: spirv.Unreachable spirv.Unreachable } + + // CHECK-LABEL: @kill + spirv.func @kill() -> () "None" { + // CHECK: spirv.Kill + spirv.Kill + } } From eb7c947272952d40d3235d89652a10da52cb2b4d Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 18 Feb 2025 19:58:42 +0700 Subject: [PATCH 120/127] AMDGPU: Correct legal literal operand logic for multiple uses (#127594) The same literal can be used multiple times in an instruction, not just once. We were not tracking the used value to verify this, so correct this. This helps avoid regressions in a future patch. --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 16 ++++- .../eliminate-frame-index-v-add-co-u32.mir | 24 +++---- .../AMDGPU/fold-literal-multiple-gfx10.mir | 66 +++++++++++++++++++ llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir | 3 +- 4 files changed, 89 insertions(+), 20 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/fold-literal-multiple-gfx10.mir diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 07addb38b8711..ceab6c9dcca34 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5931,11 +5931,17 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, if (!MO) MO = &MI.getOperand(OpIdx); + const MachineOperand *UsedLiteral = nullptr; + int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0; if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { - if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--) - return false; + if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) { + if (!LiteralLimit--) + return false; + + UsedLiteral = MO; + } SmallDenseSet SGPRsUsed; if (MO->isReg()) @@ -5956,6 +5962,12 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, } } else if (AMDGPU::isSISrcOperand(InstDesc, i) && !isInlineConstant(Op, InstDesc.operands()[i])) { + // The same literal may be used multiple times. 
+ if (!UsedLiteral) + UsedLiteral = &Op; + else if (UsedLiteral->isIdenticalTo(Op)) + continue; + if (!LiteralLimit--) return false; if (--ConstantBusLimit <= 0) diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir index 12e8d24cb3675..ade7b4266e9e6 100644 --- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir +++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir @@ -2162,8 +2162,7 @@ body: | ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX11-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec + ; GFX11-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 $sgpr33, 12352, 0, implicit $exec ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX11-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 @@ -2178,8 +2177,7 @@ body: | ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc - ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX12-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec + ; GFX12-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 $sgpr33, 4160, 0, implicit $exec ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX12-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 @@ -2315,8 +2313,7 @@ body: | ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX11-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec + ; GFX11-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 $sgpr33, 12352, 0, implicit $exec ; GFX11-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 @@ -2332,8 +2329,7 @@ body: | ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc - ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX12-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec + ; GFX12-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 $sgpr33, 4160, 0, implicit $exec ; GFX12-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 @@ -2469,8 +2465,7 @@ body: | ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX11-NEXT: renamable $vgpr0, dead renamable 
$vcc = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec + ; GFX11-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 $sgpr33, 12352, 0, implicit $exec ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX11-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 @@ -2485,8 +2480,7 @@ body: | ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc - ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX12-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec + ; GFX12-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 $sgpr33, 4160, 0, implicit $exec ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX12-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 @@ -2622,8 +2616,7 @@ body: | ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX11-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec + ; GFX11-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 $sgpr33, 12352, 0, implicit $exec ; GFX11-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 @@ -2639,8 +2632,7 @@ body: | ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc - ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX12-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec + ; GFX12-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 $sgpr33, 4160, 0, implicit $exec ; GFX12-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/fold-literal-multiple-gfx10.mir b/llvm/test/CodeGen/AMDGPU/fold-literal-multiple-gfx10.mir new file mode 100644 index 0000000000000..e71516e74f17e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fold-literal-multiple-gfx10.mir @@ -0,0 +1,66 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=si-fold-operands -o - %s | FileCheck %s + +# The same literal may be used multiple times in different operands, +# as long as it is the same value. 
+ +--- +name: fold_multiple_same_literal_use_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: fold_multiple_same_literal_use_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_DIV_SCALE_F32_e64_:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, 1178657792, 0, 1178657792, 0, 1178657792, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_DIV_SCALE_F32_e64_]] + %0:vgpr_32 = COPY $vgpr0 + %1:sreg_32 = S_MOV_B32 1178657792 + %2:vgpr_32 = COPY %1 + %3:vgpr_32, %4:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, %2, 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %3 +... + +--- +name: fold_multiple_same_literal_use_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: fold_multiple_same_literal_use_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_DIV_SCALE_F32_e64_:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, 1178657792, 0, 1178657792, 0, 1178657792, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_DIV_SCALE_F32_e64_]] + %0:vgpr_32 = COPY $vgpr0 + %1:sreg_32 = S_MOV_B32 1178657792 + %2:vgpr_32 = COPY %1 + %3:vgpr_32, %4:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, 1178657792, 0, 1178657792, 0, %2, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %3 +... + +--- +name: no_fold_multiple_same_literal_different_value +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: no_fold_multiple_same_literal_different_value + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1178657793, implicit $exec + ; CHECK-NEXT: [[V_DIV_SCALE_F32_e64_:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, 1178657792, 0, 1178657792, 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_DIV_SCALE_F32_e64_]] + %0:vgpr_32 = COPY $vgpr0 + %1:sreg_32 = S_MOV_B32 1178657793 + %2:vgpr_32 = COPY %1 + %3:vgpr_32, %4:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, 1178657792, 0, 1178657792, 0, %2, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %3 +... 
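The constant 1178657792 folded throughout the test above is 0x4640E400, the bit pattern of 12345.0f, which lies outside the inline-constant range and therefore has to be encoded as a literal operand. The rule the new bookkeeping enforces can be summarised by the standalone C++ sketch below; it is a simplification of isOperandLegal, which additionally tracks SGPR and constant-bus usage, not a copy of it.

  #include <optional>
  #include <vector>

  // One unique 32-bit literal can be encoded per instruction on subtargets
  // with VOP3 literal support; re-using the same value in several operands is
  // free, but a second distinct literal makes the fold illegal.
  bool literalOperandsAreLegal(const std::vector<long long> &Literals,
                               int LiteralLimit /* typically 1 */) {
    std::optional<long long> UsedLiteral;
    for (long long Imm : Literals) {
      if (UsedLiteral && *UsedLiteral == Imm)
        continue;             // Same literal value again: no extra cost.
      if (LiteralLimit-- <= 0)
        return false;         // Would require a second distinct literal.
      UsedLiteral = Imm;
    }
    return true;
  }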
diff --git a/llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir index 268a8a4783d24..edd5d0a119e5f 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir @@ -55,8 +55,7 @@ body: | # GCN-LABEL: name: fma_sgpr_sgpr_use # GCN: %0:sgpr_32 = IMPLICIT_DEF -# GCN-NEXT: %2:vgpr_32 = V_MOV_B32_e32 1234567, implicit $exec -# GCN-NEXT: %3:vgpr_32 = V_FMAC_F32_e64 0, %0, 0, 1234567, 0, %2, 0, 0, implicit $mode, implicit $exec +# GCN: %3:vgpr_32 = V_FMA_F32_e64 0, %0, 0, 1234567, 0, 1234567, 0, 0, implicit $mode, implicit $exec --- name: fma_sgpr_sgpr_use body: | From e5ce1d3a56676a18d1c7659f6190efcbfbb51ddd Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 18 Feb 2025 08:34:36 -0500 Subject: [PATCH 121/127] [gn] Move write_target_def_file to its own .gni file --- .../llvm/include/llvm/Config/BUILD.gn | 38 ++++--------------- .../llvm/lib/Target/write_target_def_file.gni | 36 ++++++++++++++++++ 2 files changed, 43 insertions(+), 31 deletions(-) create mode 100644 llvm/utils/gn/secondary/llvm/lib/Target/write_target_def_file.gni diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index c9f3af65a4565..5a13545a15b13 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -1,10 +1,10 @@ import("//llvm/include/llvm/Config/config.gni") import("//llvm/lib/DebugInfo/PDB/enable_dia.gni") -import("//llvm/lib/Target/targets.gni") import("//llvm/lib/Target/targets_with_asm_parsers.gni") import("//llvm/lib/Target/targets_with_disassemblers.gni") import("//llvm/lib/Target/targets_with_exegesis.gni") import("//llvm/lib/Target/targets_with_mcas.gni") +import("//llvm/lib/Target/write_target_def_file.gni") import("//llvm/triples.gni") import("//llvm/utils/gn/build/buildflags.gni") import("//llvm/utils/gn/build/libs/curl/enable.gni") @@ -477,65 +477,41 @@ write_cmake_config("llvm-config") { ############################################################################## # .def files used by llvm/lib/Target -template("write_target_def_file") { - assert(defined(invoker.key), "must set 'key' in $target_name") - assert(defined(invoker.value), "must set 'value' in $target_name") - - write_cmake_config(target_name) { - visibility = [ ":write_target_def_files" ] - input = "$target_name.in" - output = "$target_gen_dir/$target_name" - - if (defined(invoker.all_targets)) { - all_targets = invoker.all_targets - } else { - all_targets = llvm_targets_to_build - } - - # Build something like - # `LLVM_ENUM_ASM_PARSERS=LLVM_ASM_PARSER(ARM)\nLLVM_ASM_PARSER(X86)\n`. Note - # that \n is a literal '\' followed by a literal 'n', not a newline - # character. (write_cmake_config.py replaces that with a real newline). 
- value = "" - foreach(target, all_targets) { - value = "$value${invoker.value}($target)\n" - } - if (all_targets == []) { - not_needed(invoker, [ "value" ]) - } - values = [ "${invoker.key}=$value" ] - } -} - write_target_def_file("AsmParsers.def") { + visibility = [ ":write_target_def_files" ] key = "LLVM_ENUM_ASM_PARSERS" value = "LLVM_ASM_PARSER" all_targets = targets_with_asm_parsers } write_target_def_file("AsmPrinters.def") { + visibility = [ ":write_target_def_files" ] key = "LLVM_ENUM_ASM_PRINTERS" value = "LLVM_ASM_PRINTER" } write_target_def_file("Disassemblers.def") { + visibility = [ ":write_target_def_files" ] key = "LLVM_ENUM_DISASSEMBLERS" value = "LLVM_DISASSEMBLER" all_targets = targets_with_disassemblers } write_target_def_file("Targets.def") { + visibility = [ ":write_target_def_files" ] key = "LLVM_ENUM_TARGETS" value = "LLVM_TARGET" } write_target_def_file("TargetMCAs.def") { + visibility = [ ":write_target_def_files" ] key = "LLVM_ENUM_TARGETMCAS" value = "LLVM_TARGETMCA" all_targets = targets_with_mcas } write_target_def_file("TargetExegesis.def") { + visibility = [ ":write_target_def_files" ] key = "LLVM_ENUM_EXEGESIS" value = "LLVM_EXEGESIS" all_targets = targets_with_exegesis diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/write_target_def_file.gni b/llvm/utils/gn/secondary/llvm/lib/Target/write_target_def_file.gni new file mode 100644 index 0000000000000..8ff5edeb41f3d --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/lib/Target/write_target_def_file.gni @@ -0,0 +1,36 @@ +import("//llvm/lib/Target/targets.gni") +import("//llvm/utils/gn/build/write_cmake_config.gni") + +template("write_target_def_file") { + assert(defined(invoker.key), "must set 'key' in $target_name") + assert(defined(invoker.value), "must set 'value' in $target_name") + + write_cmake_config(target_name) { + input = "$target_name.in" + output = "$target_gen_dir/$target_name" + + if (defined(invoker.all_targets)) { + all_targets = invoker.all_targets + } else { + all_targets = llvm_targets_to_build + } + + if (defined(invoker.visibility)) { + visibility = invoker.visibility + } + + # Build something like + # `LLVM_ENUM_ASM_PARSERS=LLVM_ASM_PARSER(ARM)\nLLVM_ASM_PARSER(X86)\n`. Note + # that \n is a literal '\' followed by a literal 'n', not a newline + # character. (write_cmake_config.py replaces that with a real newline). 
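For context on how the .def files produced by this template are consumed: each generated file is an X-macro list with one macro invocation per configured target. A hedged sketch of the consuming side, modelled on the pattern in llvm-c/Target.h (the generated .def is assumed to #undef the macro after expanding, as the in-tree .def.in files do):

  /* Declare one initializer per configured target... */
  #define LLVM_ASM_PARSER(TargetName) void LLVMInitialize##TargetName##AsmParser(void);
  #include "llvm/Config/AsmParsers.def"

  /* ...and call them all. */
  static inline void InitializeAllAsmParsers(void) {
  #define LLVM_ASM_PARSER(TargetName) LLVMInitialize##TargetName##AsmParser();
  #include "llvm/Config/AsmParsers.def"
  }

This is why the template only needs to emit flat lines such as LLVM_ASM_PARSER(ARM): all of the structure lives in the consumer.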
+ value = "" + foreach(target, all_targets) { + value = "$value${invoker.value}($target)\n" + } + if (all_targets == []) { + not_needed(invoker, [ "value" ]) + } + values = [ "${invoker.key}=$value" ] + } +} + From 09c2441037efeaa2980da4bb24286d0684ba99b5 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 18 Feb 2025 08:41:49 -0500 Subject: [PATCH 122/127] [gn] port e235fcb582ee (bolt TargetConfig.def) --- llvm/utils/gn/secondary/bolt/include/bolt/Core/BUILD.gn | 6 ++++++ llvm/utils/gn/secondary/bolt/tools/driver/BUILD.gn | 1 + llvm/utils/gn/secondary/bolt/tools/heatmap/BUILD.gn | 1 + llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn | 1 + 4 files changed, 9 insertions(+) create mode 100644 llvm/utils/gn/secondary/bolt/include/bolt/Core/BUILD.gn diff --git a/llvm/utils/gn/secondary/bolt/include/bolt/Core/BUILD.gn b/llvm/utils/gn/secondary/bolt/include/bolt/Core/BUILD.gn new file mode 100644 index 0000000000000..2d7c1a70abe95 --- /dev/null +++ b/llvm/utils/gn/secondary/bolt/include/bolt/Core/BUILD.gn @@ -0,0 +1,6 @@ +import("//llvm/lib/Target/write_target_def_file.gni") + +write_target_def_file("TargetConfig.def") { + key = "BOLT_ENUM_TARGETS" + value = "BOLT_TARGET" +} diff --git a/llvm/utils/gn/secondary/bolt/tools/driver/BUILD.gn b/llvm/utils/gn/secondary/bolt/tools/driver/BUILD.gn index 004a7359698de..c174bf3c613f4 100644 --- a/llvm/utils/gn/secondary/bolt/tools/driver/BUILD.gn +++ b/llvm/utils/gn/secondary/bolt/tools/driver/BUILD.gn @@ -23,6 +23,7 @@ group("symlinks") { executable("llvm-bolt") { configs += [ "//llvm/utils/gn/build:bolt_code" ] deps = [ + "//bolt/include/bolt/Core:TargetConfig.def", "//bolt/lib/Profile", "//bolt/lib/Rewrite", "//bolt/lib/Utils", diff --git a/llvm/utils/gn/secondary/bolt/tools/heatmap/BUILD.gn b/llvm/utils/gn/secondary/bolt/tools/heatmap/BUILD.gn index b6270106dbaf8..78b65a12e945a 100644 --- a/llvm/utils/gn/secondary/bolt/tools/heatmap/BUILD.gn +++ b/llvm/utils/gn/secondary/bolt/tools/heatmap/BUILD.gn @@ -1,6 +1,7 @@ executable("llvm-bolt-heatmap") { configs += [ "//llvm/utils/gn/build:bolt_code" ] deps = [ + "//bolt/include/bolt/Core:TargetConfig.def", "//bolt/lib/Profile", "//bolt/lib/Rewrite", "//bolt/lib/Utils", diff --git a/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn b/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn index c7c9459fdff16..79f19a416c0e1 100644 --- a/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn +++ b/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn @@ -4,6 +4,7 @@ import("//third-party/unittest/unittest.gni") unittest("CoreTests") { configs += [ "//llvm/utils/gn/build:bolt_code" ] deps = [ + "//bolt/include/bolt/Core:TargetConfig.def", "//bolt/lib/Core", "//bolt/lib/Rewrite", "//bolt/lib/Profile", From 5fbb6d919d528d54538df3330e76f220ff52ab30 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Tue, 18 Feb 2025 14:43:35 +0100 Subject: [PATCH 123/127] [clang][bytecode] Allow up/down casts of nullptr (#127615) If the target type is a pointer type. 
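A minimal piece of C++ that exercises the new behaviour, distilled from the records.cpp test added below; both constant evaluations must now succeed under the bytecode interpreter (-fexperimental-new-constant-interpreter), while binding a reference through a null pointer remains ill-formed:

  struct A {};
  struct B : A { int n; };

  constexpr bool castNullUpAndDown() {
    auto *ToBase    = (A *)(B *)nullptr; // derived-to-base cast of a null pointer
    auto *ToDerived = (B *)(A *)nullptr; // base-to-derived cast of a null pointer
    return ToBase == nullptr && ToDerived == nullptr;
  }
  static_assert(castNullUpAndDown(), "");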
--- clang/lib/AST/ByteCode/Compiler.cpp | 6 ++++-- clang/lib/AST/ByteCode/Interp.cpp | 2 +- clang/lib/AST/ByteCode/Interp.h | 19 +++++++++++++++---- clang/lib/AST/ByteCode/Opcodes.td | 6 ++---- clang/test/AST/ByteCode/records.cpp | 18 +++++++++++++++++- 5 files changed, 39 insertions(+), 12 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index b3a81f8ff1516..503c58a67adeb 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -272,7 +272,8 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { CurType = B->getType(); } else { unsigned DerivedOffset = collectBaseOffset(B->getType(), CurType); - if (!this->emitGetPtrBasePop(DerivedOffset, CE)) + if (!this->emitGetPtrBasePop( + DerivedOffset, /*NullOK=*/CE->getType()->isPointerType(), CE)) return false; CurType = B->getType(); } @@ -288,7 +289,8 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { unsigned DerivedOffset = collectBaseOffset(SubExpr->getType(), CE->getType()); - return this->emitGetPtrDerivedPop(DerivedOffset, CE); + return this->emitGetPtrDerivedPop( + DerivedOffset, /*NullOK=*/CE->getType()->isPointerType(), CE); } case CK_FloatingCast: { diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index 0310870f7372e..c07690a3d941c 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -1433,7 +1433,7 @@ bool CallVirt(InterpState &S, CodePtr OpPC, const Function *Func, unsigned Offset = S.getContext().collectBaseOffset( InitialPointeeType->getAsRecordDecl(), OverriderPointeeType->getAsRecordDecl()); - return GetPtrBasePop(S, OpPC, Offset); + return GetPtrBasePop(S, OpPC, Offset, /*IsNullOK=*/true); } return true; diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 10cf21e28437c..ca74046038072 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -1568,10 +1568,20 @@ inline bool GetPtrActiveThisField(InterpState &S, CodePtr OpPC, uint32_t Off) { return true; } -inline bool GetPtrDerivedPop(InterpState &S, CodePtr OpPC, uint32_t Off) { +inline bool GetPtrDerivedPop(InterpState &S, CodePtr OpPC, uint32_t Off, + bool NullOK) { const Pointer &Ptr = S.Stk.pop(); - if (!CheckNull(S, OpPC, Ptr, CSK_Derived)) + if (!NullOK && !CheckNull(S, OpPC, Ptr, CSK_Derived)) return false; + + if (!Ptr.isBlockPointer()) { + // FIXME: We don't have the necessary information in integral pointers. + // The Descriptor only has a record, but that does of course not include + // the potential derived classes of said record. 
+ S.Stk.push(Ptr); + return true; + } + if (!CheckSubobject(S, OpPC, Ptr, CSK_Derived)) return false; if (!CheckDowncast(S, OpPC, Ptr, Off)) @@ -1600,10 +1610,11 @@ inline bool GetPtrBase(InterpState &S, CodePtr OpPC, uint32_t Off) { return true; } -inline bool GetPtrBasePop(InterpState &S, CodePtr OpPC, uint32_t Off) { +inline bool GetPtrBasePop(InterpState &S, CodePtr OpPC, uint32_t Off, + bool NullOK) { const Pointer &Ptr = S.Stk.pop(); - if (!CheckNull(S, OpPC, Ptr, CSK_Base)) + if (!NullOK && !CheckNull(S, OpPC, Ptr, CSK_Base)) return false; if (!Ptr.isBlockPointer()) { diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index 088a3e40fe2a7..41e4bae65c195 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -312,7 +312,7 @@ def GetPtrThisField : OffsetOpcode; // [Pointer] -> [Pointer] def GetPtrBase : OffsetOpcode; // [Pointer] -> [Pointer] -def GetPtrBasePop : OffsetOpcode; +def GetPtrBasePop : OffsetOpcode { let Args = [ArgUint32, ArgBool]; } def GetMemberPtrBasePop : Opcode { // Offset of field, which is a base. let Args = [ArgSint32]; @@ -322,9 +322,7 @@ def GetMemberPtrBasePop : Opcode { def FinishInitPop : Opcode; def FinishInit : Opcode; -def GetPtrDerivedPop : Opcode { - let Args = [ArgUint32]; -} +def GetPtrDerivedPop : Opcode { let Args = [ArgUint32, ArgBool]; } // [Pointer] -> [Pointer] def GetPtrVirtBasePop : Opcode { diff --git a/clang/test/AST/ByteCode/records.cpp b/clang/test/AST/ByteCode/records.cpp index 9470e7d8e3dcb..3cc3210841e0f 100644 --- a/clang/test/AST/ByteCode/records.cpp +++ b/clang/test/AST/ByteCode/records.cpp @@ -1656,12 +1656,28 @@ namespace ExprWithCleanups { static_assert(F == 1i, ""); } -namespace NullptrUpcast { +namespace NullptrCast { struct A {}; struct B : A { int n; }; + constexpr A *na = nullptr; constexpr B *nb = nullptr; constexpr A &ra = *nb; // both-error {{constant expression}} \ // both-note {{cannot access base class of null pointer}} + constexpr B &rb = (B&)*na; // both-error {{constant expression}} \ + // both-note {{cannot access derived class of null pointer}} + constexpr bool test() { + auto a = (A*)(B*)nullptr; + + return a == nullptr; + } + static_assert(test(), ""); + + constexpr bool test2() { + auto a = (B*)(A*)nullptr; + + return a == nullptr; + } + static_assert(test2(), ""); } namespace NonConst { From 7e2707ad4673869fcca119a0ad8bd25aa38a5503 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Tue, 18 Feb 2025 13:57:49 +0000 Subject: [PATCH 124/127] [mlir][nfc] Add a negative test for --linalg-specialize-generic-ops (#127600) Following on from #126737, adds a negative test that: * prior to #126737, would incorrectly generated empty output, * with the fix in-tree, simply outputs the input IR (i.e. the specialization "fails"). I've also made minor editorial changes. 
--- .../Linalg/specialize-generic-ops-fail.mlir | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/mlir/test/Dialect/Linalg/specialize-generic-ops-fail.mlir b/mlir/test/Dialect/Linalg/specialize-generic-ops-fail.mlir index 542a7ed4a198b..357f2c11a7936 100644 --- a/mlir/test/Dialect/Linalg/specialize-generic-ops-fail.mlir +++ b/mlir/test/Dialect/Linalg/specialize-generic-ops-fail.mlir @@ -6,11 +6,26 @@ // CHECK-LABEL: @transpose_and_broadcast // CHECK: linalg.generic func.func @transpose_and_broadcast(%arg0: tensor<7x8xf32>, %arg1: tensor<8x7x9xf32>) -> tensor<8x7x9xf32> { - %0 = linalg.generic - {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} - ins(%arg0 : tensor<7x8xf32>) outs(%arg1 : tensor<8x7x9xf32>) { - ^bb0(%in: f32, %out: f32): - linalg.yield %in : f32 + %res = linalg.generic { + indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"] + } ins(%arg0 : tensor<7x8xf32>) outs(%arg1 : tensor<8x7x9xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 } -> tensor<8x7x9xf32> - return %0 : tensor<8x7x9xf32> + return %res : tensor<8x7x9xf32> +} + +// ----- + +#map = affine_map<(d0) -> (d0)> +// CHECK-LABEL: @neither_permutation_nor_broadcast +// CHECK: linalg.generic +func.func @neither_permutation_nor_broadcast(%init : tensor<8xi32>) -> tensor<8xi32> { + %res = linalg.generic { + indexing_maps = [#map], iterator_types = ["parallel"] + } outs(%init: tensor<8xi32>) { + ^bb0(%out: i32): + linalg.yield %out: i32 + } -> tensor<8xi32> + return %res : tensor<8xi32> } From 3b6cc94e7410b818658693885d4f5857c2fdbc6b Mon Sep 17 00:00:00 2001 From: Abhina Sree Date: Tue, 18 Feb 2025 09:09:10 -0500 Subject: [PATCH 125/127] [SystemZ][z/OS] Mark text files as text in ClangScanDeps (#127514) This patch continues the work that was started here https://reviews.llvm.org/D99426 to correctly open text files in text mode. 
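The pattern being adopted is small but worth spelling out, since it recurs throughout this line of work. A hedged, self-contained sketch of the call (IsText is the second parameter of llvm::MemoryBuffer::getFile, as the diff below shows); on z/OS, opening in text mode lets the runtime apply the expected character-set conversion, while on other hosts the call behaves like a plain open:

  #include "llvm/Support/MemoryBuffer.h"
  #include "llvm/Support/raw_ostream.h"

  // Read a file that is known to contain text and echo it to stdout.
  static int dumpTextFile(const char *Path) {
    auto Buf = llvm::MemoryBuffer::getFile(Path, /*IsText=*/true);
    if (!Buf) {
      llvm::errs() << "cannot open " << Path << ": "
                   << Buf.getError().message() << "\n";
      return 1;
    }
    llvm::outs() << Buf.get()->getBuffer();
    return 0;
  }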
--- clang/tools/clang-scan-deps/ClangScanDeps.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index 9cdb1eae56187..3bdeb461e4bfa 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -298,12 +298,14 @@ class ResourceDirectoryCache { }; if (llvm::sys::ExecuteAndWait(ClangBinaryPath, PrintResourceDirArgs, {}, Redirects)) { - auto ErrorBuf = llvm::MemoryBuffer::getFile(ErrorFile.c_str()); + auto ErrorBuf = + llvm::MemoryBuffer::getFile(ErrorFile.c_str(), /*IsText=*/true); llvm::errs() << ErrorBuf.get()->getBuffer(); return ""; } - auto OutputBuf = llvm::MemoryBuffer::getFile(OutputFile.c_str()); + auto OutputBuf = + llvm::MemoryBuffer::getFile(OutputFile.c_str(), /*IsText=*/true); if (!OutputBuf) return ""; StringRef Output = OutputBuf.get()->getBuffer().rtrim('\n'); @@ -1032,7 +1034,8 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) { std::unique_ptr TU; std::optional TUBuffer; if (!TranslationUnitFile.empty()) { - auto MaybeTU = llvm::MemoryBuffer::getFile(TranslationUnitFile); + auto MaybeTU = + llvm::MemoryBuffer::getFile(TranslationUnitFile, /*IsText=*/true); if (!MaybeTU) { llvm::errs() << "cannot open input translation unit: " << MaybeTU.getError().message() << "\n"; From 0d666598a06420d1c59f3b02ef5022ec9af39b0d Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 18 Feb 2025 15:12:02 +0100 Subject: [PATCH 126/127] [BasicAA] Add test for #126670 (NFC) --- .../BasicAA/escape-source-aggregate.ll | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 llvm/test/Analysis/BasicAA/escape-source-aggregate.ll diff --git a/llvm/test/Analysis/BasicAA/escape-source-aggregate.ll b/llvm/test/Analysis/BasicAA/escape-source-aggregate.ll new file mode 100644 index 0000000000000..cef11b94f3873 --- /dev/null +++ b/llvm/test/Analysis/BasicAA/escape-source-aggregate.ll @@ -0,0 +1,24 @@ +; RUN: opt -passes=aa-eval -print-all-alias-modref-info -disable-output 2>&1 < %s | FileCheck %s + +declare { ptr, i1 } @get_struct() +declare <2 x ptr> @get_vec() + +; CHECK: MayAlias: i32* %a, i32* %extract +define i32 @test_extractvalue() { + %a = alloca i32 + %call = call { ptr, i1 } @get_struct() + %extract = extractvalue { ptr, i1 } %call, 0 + store i32 0, ptr %extract + %v = load i32, ptr %a + ret i32 %v +} + +; CHECK: MayAlias: i32* %a, i32* %extract +define i32 @test_extractelement() { + %a = alloca i32 + %call = call <2 x ptr> @get_vec() + %extract = extractelement <2 x ptr> %call, i32 0 + store i32 0, ptr %extract + %v = load i32, ptr %a + ret i32 %v +} From 27fe2c95ee067ee013b947040538224187b3adb7 Mon Sep 17 00:00:00 2001 From: Danial Klimkin Date: Tue, 18 Feb 2025 15:12:49 +0100 Subject: [PATCH 127/127] [bazel]Move HAVE_GETAUXVAL from config.h to config.bzl (#127637) This fixes build errors on mac OS. 
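The reason the define cannot stay in the shared config.h: getauxval() is a Linux/glibc facility with no macOS counterpart, so any code probing the auxiliary vector must sit behind the feature macro, and the macro must only be set where the function exists. A hedged sketch of the usage pattern this macro gates (the real call sites in LLVM's host-detection code may differ):

  #if HAVE_GETAUXVAL
  #include <sys/auxv.h>
  // Query the hardware-capability bits the kernel passes in the aux vector.
  static unsigned long getHostHwcap() { return getauxval(AT_HWCAP); }
  #else
  // No auxiliary vector on this host (e.g. macOS): report no extra features.
  static unsigned long getHostHwcap() { return 0; }
  #endif

Moving the define into the per-platform linux_defines list in config.bzl keeps it set only for Linux builds, matching what the CMake configure check does.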
--- utils/bazel/llvm-project-overlay/llvm/config.bzl | 1 + .../llvm-project-overlay/llvm/include/llvm/Config/config.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/config.bzl b/utils/bazel/llvm-project-overlay/llvm/config.bzl index 6e703d22e7756..fa616bcb9a8c9 100644 --- a/utils/bazel/llvm-project-overlay/llvm/config.bzl +++ b/utils/bazel/llvm-project-overlay/llvm/config.bzl @@ -47,6 +47,7 @@ posix_defines = [ linux_defines = posix_defines + [ "_GNU_SOURCE", + "HAVE_GETAUXVAL=1", "HAVE_MALLINFO=1", "HAVE_SBRK=1", "HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC=1", diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h index 93695f8e26d27..3ef1d0c4b1651 100644 --- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h +++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h @@ -296,7 +296,7 @@ /* HAVE_PROC_PID_RUSAGE defined in Bazel */ -#define HAVE_GETAUXVAL 1 +/* HAVE_GETAUXVAL defined in Bazel */ /* Directly provide definitions here behind platform preprocessor definitions. * The preprocessor conditions are sufficient to handle all of the configuration