diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt
index 04db160b64b05..f5ffa81227064 100644
--- a/bolt/CMakeLists.txt
+++ b/bolt/CMakeLists.txt
@@ -202,3 +202,11 @@ endif()
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/bolt/RuntimeLibs/RuntimeLibraryVariables.inc.in
   ${CMAKE_CURRENT_BINARY_DIR}/include/bolt/RuntimeLibs/RuntimeLibraryVariables.inc @ONLY)
+
+set(BOLT_ENUM_TARGETS "")
+foreach(t ${BOLT_TARGETS_TO_BUILD})
+  set(BOLT_ENUM_TARGETS "${BOLT_ENUM_TARGETS}BOLT_TARGET(${t})\n")
+endforeach(t)
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/bolt/Core/TargetConfig.def.in
+  ${CMAKE_CURRENT_BINARY_DIR}/include/bolt/Core/TargetConfig.def @ONLY)
diff --git a/bolt/include/bolt/Core/TargetConfig.def.in b/bolt/include/bolt/Core/TargetConfig.def.in
new file mode 100644
index 0000000000000..a52ebd92b56fd
--- /dev/null
+++ b/bolt/include/bolt/Core/TargetConfig.def.in
@@ -0,0 +1,23 @@
+//===-- TargetConfig.def.in - Information about available targets ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is configured by the build system to define the available bolt
+// targets.
+//
+// The variant of this file not ending with .in has been autogenerated by the
+// LLVM build. Do not edit!
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BOLT_TARGET
+# error Please define the macro BOLT_TARGET(TargetName)
+#endif
+
+@BOLT_ENUM_TARGETS@
+
+#undef BOLT_TARGET
diff --git a/bolt/tools/binary-analysis/CMakeLists.txt b/bolt/tools/binary-analysis/CMakeLists.txt
index 841fc5b371185..29f224e0f66ff 100644
--- a/bolt/tools/binary-analysis/CMakeLists.txt
+++ b/bolt/tools/binary-analysis/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(LLVM_LINK_COMPONENTS
-  ${LLVM_TARGETS_TO_BUILD}
+  ${BOLT_TARGETS_TO_BUILD}
   MC
   Object
   Support
diff --git a/bolt/tools/binary-analysis/binary-analysis.cpp b/bolt/tools/binary-analysis/binary-analysis.cpp
index b03fee3e025ae..0e3584eeedd18 100644
--- a/bolt/tools/binary-analysis/binary-analysis.cpp
+++ b/bolt/tools/binary-analysis/binary-analysis.cpp
@@ -88,13 +88,15 @@ int main(int argc, char **argv) {
   llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
 
   // Initialize targets and assembly printers/parsers.
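// Illustrative note (the target list "AArch64;X86" is an assumption, not part
// of the patch): with BOLT_TARGETS_TO_BUILD set to "AArch64;X86", the
// configure_file() step above produces a TargetConfig.def whose body is
//
//   BOLT_TARGET(AArch64)
//   BOLT_TARGET(X86)
//
// so the BOLT_TARGET(target) macro defined in the hunk below expands the
// #include "bolt/Core/TargetConfig.def" into per-target calls such as
// LLVMInitializeAArch64TargetInfo() through LLVMInitializeX86AsmPrinter(),
// initializing only the targets BOLT was configured for rather than every
// target LLVM was built with.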
- llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllDisassemblers(); - - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmPrinters(); +#define BOLT_TARGET(target) \ + LLVMInitialize##target##TargetInfo(); \ + LLVMInitialize##target##TargetMC(); \ + LLVMInitialize##target##AsmParser(); \ + LLVMInitialize##target##Disassembler(); \ + LLVMInitialize##target##Target(); \ + LLVMInitialize##target##AsmPrinter(); + +#include "bolt/Core/TargetConfig.def" ParseCommandLine(argc, argv); diff --git a/bolt/tools/driver/CMakeLists.txt b/bolt/tools/driver/CMakeLists.txt index 9bf9ff85edc7b..4b3c7416de974 100644 --- a/bolt/tools/driver/CMakeLists.txt +++ b/bolt/tools/driver/CMakeLists.txt @@ -1,5 +1,5 @@ set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} + ${BOLT_TARGETS_TO_BUILD} MC Object Support diff --git a/bolt/tools/driver/llvm-bolt.cpp b/bolt/tools/driver/llvm-bolt.cpp index f151cf5f63fc5..6b6714723fa3b 100644 --- a/bolt/tools/driver/llvm-bolt.cpp +++ b/bolt/tools/driver/llvm-bolt.cpp @@ -183,13 +183,15 @@ int main(int argc, char **argv) { std::string ToolPath = llvm::sys::fs::getMainExecutable(argv[0], nullptr); // Initialize targets and assembly printers/parsers. - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllDisassemblers(); - - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmPrinters(); +#define BOLT_TARGET(target) \ + LLVMInitialize##target##TargetInfo(); \ + LLVMInitialize##target##TargetMC(); \ + LLVMInitialize##target##AsmParser(); \ + LLVMInitialize##target##Disassembler(); \ + LLVMInitialize##target##Target(); \ + LLVMInitialize##target##AsmPrinter(); + +#include "bolt/Core/TargetConfig.def" ToolName = argv[0]; diff --git a/bolt/tools/heatmap/CMakeLists.txt b/bolt/tools/heatmap/CMakeLists.txt index acddc7a50e8b1..c5d3f67413929 100644 --- a/bolt/tools/heatmap/CMakeLists.txt +++ b/bolt/tools/heatmap/CMakeLists.txt @@ -1,5 +1,5 @@ set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} + ${BOLT_TARGETS_TO_BUILD} MC Object Support diff --git a/bolt/tools/heatmap/heatmap.cpp b/bolt/tools/heatmap/heatmap.cpp index 3bb9f2ce7491d..6add36cc6715f 100644 --- a/bolt/tools/heatmap/heatmap.cpp +++ b/bolt/tools/heatmap/heatmap.cpp @@ -76,13 +76,15 @@ int main(int argc, char **argv) { opts::OutputFilename = "-"; // Initialize targets and assembly printers/parsers. 
- llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllDisassemblers(); - - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmPrinters(); +#define BOLT_TARGET(target) \ + LLVMInitialize##target##TargetInfo(); \ + LLVMInitialize##target##TargetMC(); \ + LLVMInitialize##target##AsmParser(); \ + LLVMInitialize##target##Disassembler(); \ + LLVMInitialize##target##Target(); \ + LLVMInitialize##target##AsmPrinter(); + +#include "bolt/Core/TargetConfig.def" ToolName = argv[0]; std::string ToolPath = GetExecutablePath(argv[0]); diff --git a/bolt/tools/llvm-bolt-fuzzer/CMakeLists.txt b/bolt/tools/llvm-bolt-fuzzer/CMakeLists.txt index f21285f634bad..7eaacb74a9da6 100644 --- a/bolt/tools/llvm-bolt-fuzzer/CMakeLists.txt +++ b/bolt/tools/llvm-bolt-fuzzer/CMakeLists.txt @@ -1,5 +1,5 @@ set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} + ${BOLT_TARGETS_TO_BUILD} ) add_llvm_fuzzer(llvm-bolt-fuzzer diff --git a/bolt/tools/llvm-bolt-fuzzer/llvm-bolt-fuzzer.cpp b/bolt/tools/llvm-bolt-fuzzer/llvm-bolt-fuzzer.cpp index bdb5768a91da1..09049788aebec 100644 --- a/bolt/tools/llvm-bolt-fuzzer/llvm-bolt-fuzzer.cpp +++ b/bolt/tools/llvm-bolt-fuzzer/llvm-bolt-fuzzer.cpp @@ -58,13 +58,16 @@ extern "C" int LLVMFuzzerTestOneInput(const char *Data, size_t Size) { extern "C" LLVM_ATTRIBUTE_USED int LLVMFuzzerInitialize(int *argc, char ***argv) { - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllDisassemblers(); + // Initialize targets and assembly printers/parsers. +#define BOLT_TARGET(target) \ + LLVMInitialize##target##TargetInfo(); \ + LLVMInitialize##target##TargetMC(); \ + LLVMInitialize##target##AsmParser(); \ + LLVMInitialize##target##Disassembler(); \ + LLVMInitialize##target##Target(); \ + LLVMInitialize##target##AsmPrinter(); - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmPrinters(); +#include "bolt/Core/TargetConfig.def" return 0; } diff --git a/bolt/unittests/Core/BinaryContext.cpp b/bolt/unittests/Core/BinaryContext.cpp index 0fefa1b83c3c2..09d16966334da 100644 --- a/bolt/unittests/Core/BinaryContext.cpp +++ b/bolt/unittests/Core/BinaryContext.cpp @@ -27,12 +27,15 @@ struct BinaryContextTester : public testing::TestWithParam { protected: void initalizeLLVM() { - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllDisassemblers(); - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmPrinters(); +#define BOLT_TARGET(target) \ + LLVMInitialize##target##TargetInfo(); \ + LLVMInitialize##target##TargetMC(); \ + LLVMInitialize##target##AsmParser(); \ + LLVMInitialize##target##Disassembler(); \ + LLVMInitialize##target##Target(); \ + LLVMInitialize##target##AsmPrinter(); + +#include "bolt/Core/TargetConfig.def" } void prepareElf() { diff --git a/bolt/unittests/Core/CMakeLists.txt b/bolt/unittests/Core/CMakeLists.txt index 208cf6ced7358..8ac88b701ea05 100644 --- a/bolt/unittests/Core/CMakeLists.txt +++ b/bolt/unittests/Core/CMakeLists.txt @@ -2,7 +2,7 @@ set(LLVM_LINK_COMPONENTS DebugInfoDWARF Object MC - ${LLVM_TARGETS_TO_BUILD} + ${BOLT_TARGETS_TO_BUILD} ) add_bolt_unittest(CoreTests diff --git a/bolt/unittests/Core/MCPlusBuilder.cpp b/bolt/unittests/Core/MCPlusBuilder.cpp index 5488cae366284..d367eb07f7767 100644 --- a/bolt/unittests/Core/MCPlusBuilder.cpp +++ b/bolt/unittests/Core/MCPlusBuilder.cpp @@ -37,12 +37,15 @@ struct MCPlusBuilderTester : public testing::TestWithParam { 
protected: void initalizeLLVM() { - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllDisassemblers(); - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmPrinters(); +#define BOLT_TARGET(target) \ + LLVMInitialize##target##TargetInfo(); \ + LLVMInitialize##target##TargetMC(); \ + LLVMInitialize##target##AsmParser(); \ + LLVMInitialize##target##Disassembler(); \ + LLVMInitialize##target##Target(); \ + LLVMInitialize##target##AsmPrinter(); + +#include "bolt/Core/TargetConfig.def" } void prepareElf() { diff --git a/bolt/unittests/Core/MemoryMaps.cpp b/bolt/unittests/Core/MemoryMaps.cpp index 06073d0a82e14..2e1bc4d280aed 100644 --- a/bolt/unittests/Core/MemoryMaps.cpp +++ b/bolt/unittests/Core/MemoryMaps.cpp @@ -38,12 +38,15 @@ struct MemoryMapsTester : public testing::TestWithParam { protected: void initalizeLLVM() { - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllDisassemblers(); - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmPrinters(); +#define BOLT_TARGET(target) \ + LLVMInitialize##target##TargetInfo(); \ + LLVMInitialize##target##TargetMC(); \ + LLVMInitialize##target##AsmParser(); \ + LLVMInitialize##target##Disassembler(); \ + LLVMInitialize##target##Target(); \ + LLVMInitialize##target##AsmPrinter(); + +#include "bolt/Core/TargetConfig.def" } void prepareElf() { diff --git a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h index e030bf04122d5..1e612e2ba618e 100644 --- a/clang-tools-extra/clangd/ClangdServer.h +++ b/clang-tools-extra/clangd/ClangdServer.h @@ -184,7 +184,7 @@ class ClangdServer { bool UseDirtyHeaders = false; // If true, parse emplace-like functions in the preamble. - bool PreambleParseForwardingFunctions = false; + bool PreambleParseForwardingFunctions = true; /// Whether include fixer insertions for Objective-C code should use #import /// instead of #include. @@ -501,7 +501,7 @@ class ClangdServer { // Whether the client supports folding only complete lines. bool LineFoldingOnly = false; - bool PreambleParseForwardingFunctions = false; + bool PreambleParseForwardingFunctions = true; bool ImportInsertions = false; diff --git a/clang-tools-extra/clangd/Compiler.h b/clang-tools-extra/clangd/Compiler.h index 4e68da7610ca2..e513e4c40794a 100644 --- a/clang-tools-extra/clangd/Compiler.h +++ b/clang-tools-extra/clangd/Compiler.h @@ -40,7 +40,7 @@ class IgnoreDiagnostics : public DiagnosticConsumer { // Options to run clang e.g. when parsing AST. struct ParseOptions { - bool PreambleParseForwardingFunctions = false; + bool PreambleParseForwardingFunctions = true; bool ImportInsertions = false; }; diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6272f32fa845a..a91c764860ccd 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -173,6 +173,7 @@ Bug Fixes to C++ Support Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ +- Fixed type checking when a statement expression ends in an l-value of atomic type. (#GH106576) Miscellaneous Bug Fixes ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index a96b9c0a17045..d275873651786 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -287,8 +287,8 @@ class ASTContext : public RefCountedBase { /// This is lazily created. This is intentionally not serialized. 
mutable llvm::DenseMap ASTRecordLayouts; - mutable llvm::DenseMap - ObjCLayouts; + mutable llvm::DenseMap + ObjCLayouts; /// A cache from types to size and alignment information. using TypeInfoMap = llvm::DenseMap; @@ -500,6 +500,11 @@ class ASTContext : public RefCountedBase { static constexpr unsigned GeneralTypesLog2InitSize = 9; static constexpr unsigned FunctionProtoTypesLog2InitSize = 12; + /// A mapping from an ObjC class to its subclasses. + llvm::DenseMap> + ObjCSubClasses; + ASTContext &this_() { return *this; } public: @@ -2671,13 +2676,6 @@ class ASTContext : public RefCountedBase { void DumpRecordLayout(const RecordDecl *RD, raw_ostream &OS, bool Simple = false) const; - /// Get or compute information about the layout of the specified - /// Objective-C implementation. - /// - /// This may differ from the interface if synthesized ivars are present. - const ASTRecordLayout & - getASTObjCImplementationLayout(const ObjCImplementationDecl *D) const; - /// Get our current best idea for the key function of the /// given record decl, or nullptr if there isn't one. /// @@ -2716,7 +2714,6 @@ class ASTContext : public RefCountedBase { /// Get the offset of an ObjCIvarDecl in bits. uint64_t lookupFieldBitOffset(const ObjCInterfaceDecl *OID, - const ObjCImplementationDecl *ID, const ObjCIvarDecl *Ivar) const; /// Find the 'this' offset for the member path in a pointer-to-member @@ -3174,7 +3171,12 @@ class ASTContext : public RefCountedBase { bool &CanUseFirst, bool &CanUseSecond, SmallVectorImpl &NewParamInfos); - void ResetObjCLayout(const ObjCContainerDecl *CD); + void ResetObjCLayout(const ObjCInterfaceDecl *D); + + void addObjCSubClass(const ObjCInterfaceDecl *D, + const ObjCInterfaceDecl *SubClass) { + ObjCSubClasses[D].push_back(SubClass); + } //===--------------------------------------------------------------------===// // Integer Predicates @@ -3564,9 +3566,7 @@ OPT_LIST(V) friend class DeclarationNameTable; friend class DeclContext; - const ASTRecordLayout & - getObjCLayout(const ObjCInterfaceDecl *D, - const ObjCImplementationDecl *Impl) const; + const ASTRecordLayout &getObjCLayout(const ObjCInterfaceDecl *D) const; /// A set of deallocations that should be performed when the /// ASTContext is destroyed. diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h index 766821b4fb25c..266b93a64a390 100644 --- a/clang/include/clang/AST/DeclCXX.h +++ b/clang/include/clang/AST/DeclCXX.h @@ -4194,8 +4194,8 @@ class BindingDecl : public ValueDecl { /// decomposition declaration, and when the initializer is type-dependent. Expr *getBinding() const { return Binding; } - // Get the array of Exprs when the binding represents a pack. - llvm::ArrayRef getBindingPackExprs() const; + // Get the array of nested BindingDecls when the binding represents a pack. + llvm::ArrayRef getBindingPackDecls() const; /// Get the decomposition declaration that this binding represents a /// decomposition of. @@ -4246,10 +4246,8 @@ class DecompositionDecl final for (auto *B : Bindings) { B->setDecomposedDecl(this); if (B->isParameterPack() && B->getBinding()) { - for (Expr *E : B->getBindingPackExprs()) { - auto *DRE = cast(E); - auto *NestedB = cast(DRE->getDecl()); - NestedB->setDecomposedDecl(this); + for (BindingDecl *NestedBD : B->getBindingPackDecls()) { + NestedBD->setDecomposedDecl(this); } } } @@ -4278,25 +4276,21 @@ class DecompositionDecl final // Provide a flattened range to visit each binding. 
auto flat_bindings() const { llvm::ArrayRef Bindings = bindings(); - llvm::ArrayRef PackExprs; + llvm::ArrayRef PackBindings; // Split the bindings into subranges split by the pack. - auto S1 = Bindings.take_until( + llvm::ArrayRef BeforePackBindings = Bindings.take_until( [](BindingDecl *BD) { return BD->isParameterPack(); }); - Bindings = Bindings.drop_front(S1.size()); + Bindings = Bindings.drop_front(BeforePackBindings.size()); if (!Bindings.empty()) { - PackExprs = Bindings.front()->getBindingPackExprs(); + PackBindings = Bindings.front()->getBindingPackDecls(); Bindings = Bindings.drop_front(); } - auto S2 = llvm::map_range(PackExprs, [](Expr *E) { - auto *DRE = cast(E); - return cast(DRE->getDecl()); - }); - - return llvm::concat(std::move(S1), std::move(S2), - std::move(Bindings)); + return llvm::concat(std::move(BeforePackBindings), + std::move(PackBindings), + std::move(Bindings)); } void printName(raw_ostream &OS, const PrintingPolicy &Policy) const override; diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 98ba2bb41bb54..abc65e77da021 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -4633,8 +4633,8 @@ class SubstNonTypeTemplateParmPackExpr : public Expr { } }; -/// Represents a reference to a function parameter pack or init-capture pack -/// that has been substituted but not yet expanded. +/// Represents a reference to a function parameter pack, init-capture pack, +/// or binding pack that has been substituted but not yet expanded. /// /// When a pack expansion contains multiple parameter packs at different levels, /// this node is used to represent a function parameter pack at an outer level @@ -4649,13 +4649,13 @@ class SubstNonTypeTemplateParmPackExpr : public Expr { /// \endcode class FunctionParmPackExpr final : public Expr, - private llvm::TrailingObjects { + private llvm::TrailingObjects { friend class ASTReader; friend class ASTStmtReader; friend TrailingObjects; /// The function parameter pack which was referenced. - VarDecl *ParamPack; + ValueDecl *ParamPack; /// The location of the function parameter pack reference. SourceLocation NameLoc; @@ -4663,35 +4663,34 @@ class FunctionParmPackExpr final /// The number of expansions of this pack. unsigned NumParameters; - FunctionParmPackExpr(QualType T, VarDecl *ParamPack, - SourceLocation NameLoc, unsigned NumParams, - VarDecl *const *Params); + FunctionParmPackExpr(QualType T, ValueDecl *ParamPack, SourceLocation NameLoc, + unsigned NumParams, ValueDecl *const *Params); public: static FunctionParmPackExpr *Create(const ASTContext &Context, QualType T, - VarDecl *ParamPack, + ValueDecl *ParamPack, SourceLocation NameLoc, - ArrayRef Params); + ArrayRef Params); static FunctionParmPackExpr *CreateEmpty(const ASTContext &Context, unsigned NumParams); /// Get the parameter pack which this expression refers to. - VarDecl *getParameterPack() const { return ParamPack; } + ValueDecl *getParameterPack() const { return ParamPack; } /// Get the location of the parameter pack. SourceLocation getParameterPackLocation() const { return NameLoc; } /// Iterators over the parameters which the parameter pack expanded /// into. - using iterator = VarDecl * const *; - iterator begin() const { return getTrailingObjects(); } + using iterator = ValueDecl *const *; + iterator begin() const { return getTrailingObjects(); } iterator end() const { return begin() + NumParameters; } /// Get the number of parameters in this parameter pack. 
unsigned getNumExpansions() const { return NumParameters; } /// Get an expansion of the parameter pack by index. - VarDecl *getExpansion(unsigned I) const { return begin()[I]; } + ValueDecl *getExpansion(unsigned I) const { return begin()[I]; } SourceLocation getBeginLoc() const LLVM_READONLY { return NameLoc; } SourceLocation getEndLoc() const LLVM_READONLY { return NameLoc; } @@ -5319,59 +5318,6 @@ class BuiltinBitCastExpr final } }; -// Represents an unexpanded pack where the list of expressions are -// known. These are used when structured bindings introduce a pack. -class ResolvedUnexpandedPackExpr final - : public Expr, - private llvm::TrailingObjects { - friend class ASTStmtReader; - friend class ASTStmtWriter; - friend TrailingObjects; - - SourceLocation BeginLoc; - unsigned NumExprs; - - ResolvedUnexpandedPackExpr(SourceLocation BL, QualType QT, unsigned NumExprs); - -public: - static ResolvedUnexpandedPackExpr *CreateDeserialized(ASTContext &C, - unsigned NumExprs); - static ResolvedUnexpandedPackExpr * - Create(ASTContext &C, SourceLocation BeginLoc, QualType T, unsigned NumExprs); - static ResolvedUnexpandedPackExpr *Create(ASTContext &C, - SourceLocation BeginLoc, QualType T, - llvm::ArrayRef Exprs); - - unsigned getNumExprs() const { return NumExprs; } - - llvm::MutableArrayRef getExprs() { - return {getTrailingObjects(), NumExprs}; - } - - llvm::ArrayRef getExprs() const { - return {getTrailingObjects(), NumExprs}; - } - - Expr *getExpansion(unsigned Idx) { return getExprs()[Idx]; } - Expr *getExpansion(unsigned Idx) const { return getExprs()[Idx]; } - - // Iterators - child_range children() { - return child_range((Stmt **)getTrailingObjects(), - (Stmt **)getTrailingObjects() + getNumExprs()); - } - - SourceLocation getBeginLoc() const LLVM_READONLY { return BeginLoc; } - SourceLocation getEndLoc() const LLVM_READONLY { return BeginLoc; } - - // Returns the resolved pack of a decl or nullptr - static ResolvedUnexpandedPackExpr *getFromDecl(Decl *); - - static bool classof(const Stmt *T) { - return T->getStmtClass() == ResolvedUnexpandedPackExprClass; - } -}; - } // namespace clang #endif // LLVM_CLANG_AST_EXPRCXX_H diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 560de7da9913a..5964cbaec8e44 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -2950,7 +2950,6 @@ DEF_TRAVERSE_STMT(FunctionParmPackExpr, {}) DEF_TRAVERSE_STMT(CXXFoldExpr, {}) DEF_TRAVERSE_STMT(AtomicExpr, {}) DEF_TRAVERSE_STMT(CXXParenListInitExpr, {}) -DEF_TRAVERSE_STMT(ResolvedUnexpandedPackExpr, {}) DEF_TRAVERSE_STMT(MaterializeTemporaryExpr, { if (S->getLifetimeExtendedTemporaryDecl()) { diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index bfab0baa089cf..383440ddbc0ea 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -365,6 +365,7 @@ LANGOPT(ObjCDisableDirectMethodsForTesting, 1, 0, LANGOPT(CFProtectionBranch , 1, 0, "Control-Flow Branch Protection enabled") ENUM_LANGOPT(CFBranchLabelScheme, CFBranchLabelSchemeKind, 2, CFBranchLabelSchemeKind::Default, "Control-Flow Branch Protection Label Scheme") +LANGOPT(CFProtectionReturn, 1, 0, "Control-Flow Return Protection enabled") LANGOPT(FakeAddressSpaceMap , 1, 0, "OpenCL fake address space map") ENUM_LANGOPT(AddressSpaceMapMangling , AddrSpaceMapMangling, 2, ASMM_Target, "OpenCL address space map mangling mode") 
LANGOPT(IncludeDefaultHeader, 1, 0, "Include default header file for OpenCL") diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td index 3533c5f50742e..ae49671058a01 100644 --- a/clang/include/clang/Basic/StmtNodes.td +++ b/clang/include/clang/Basic/StmtNodes.td @@ -163,7 +163,6 @@ def MaterializeTemporaryExpr : StmtNode; def LambdaExpr : StmtNode; def CXXFoldExpr : StmtNode; def CXXParenListInitExpr: StmtNode; -def ResolvedUnexpandedPackExpr : StmtNode; // C++ Coroutines expressions def CoroutineSuspendExpr : StmtNode; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index a501b901862b6..c55b964650323 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -232,8 +232,7 @@ void threadSafetyCleanup(BeforeSet *Cache); // FIXME: No way to easily map from TemplateTypeParmTypes to // TemplateTypeParmDecls, so we have this horrible PointerUnion. -typedef std::pair, +typedef std::pair, SourceLocation> UnexpandedParameterPack; diff --git a/clang/include/clang/Sema/Template.h b/clang/include/clang/Sema/Template.h index 4206bd50b13dd..647c4cfa341e1 100644 --- a/clang/include/clang/Sema/Template.h +++ b/clang/include/clang/Sema/Template.h @@ -365,7 +365,7 @@ enum class TemplateSubstitutionKind : char { class LocalInstantiationScope { public: /// A set of declarations. - using DeclArgumentPack = SmallVector; + using DeclArgumentPack = SmallVector; private: /// Reference to the semantic analysis that is performing diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index ad93d50f6a82b..37cdb0fc9faa8 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -1908,7 +1908,6 @@ enum StmtCode { EXPR_PACK_EXPANSION, // PackExpansionExpr EXPR_PACK_INDEXING, // PackIndexingExpr EXPR_SIZEOF_PACK, // SizeOfPackExpr - EXPR_RESOLVED_UNEXPANDED_PACK, // ResolvedUnexpandedPackExpr EXPR_SUBST_NON_TYPE_TEMPLATE_PARM, // SubstNonTypeTemplateParmExpr EXPR_SUBST_NON_TYPE_TEMPLATE_PARM_PACK, // SubstNonTypeTemplateParmPackExpr EXPR_FUNCTION_PARM_PACK, // FunctionParmPackExpr diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index b1b9d56ccca9f..7c70534388b4c 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -948,9 +948,11 @@ void ASTContext::cleanup() { // ASTRecordLayout objects in ASTRecordLayouts must always be destroyed // because they can contain DenseMaps. - for (llvm::DenseMap::iterator - I = ObjCLayouts.begin(), E = ObjCLayouts.end(); I != E; ) + for (llvm::DenseMap::iterator + I = ObjCLayouts.begin(), + E = ObjCLayouts.end(); + I != E;) // Increment in loop to prevent using deallocated memory. 
if (auto *R = const_cast((I++)->second)) R->Destroy(*this); @@ -3092,13 +3094,7 @@ TypeSourceInfo *ASTContext::getTrivialTypeSourceInfo(QualType T, const ASTRecordLayout & ASTContext::getASTObjCInterfaceLayout(const ObjCInterfaceDecl *D) const { - return getObjCLayout(D, nullptr); -} - -const ASTRecordLayout & -ASTContext::getASTObjCImplementationLayout( - const ObjCImplementationDecl *D) const { - return getObjCLayout(D->getClassInterface(), D); + return getObjCLayout(D); } static auto getCanonicalTemplateArguments(const ASTContext &C, @@ -8916,8 +8912,7 @@ static void EncodeBitField(const ASTContext *Ctx, std::string& S, uint64_t Offset; if (const auto *IVD = dyn_cast(FD)) { - Offset = Ctx->lookupFieldBitOffset(IVD->getContainingInterface(), nullptr, - IVD); + Offset = Ctx->lookupFieldBitOffset(IVD->getContainingInterface(), IVD); } else { const RecordDecl *RD = FD->getParent(); const ASTRecordLayout &RL = Ctx->getASTRecordLayout(RD); @@ -11848,8 +11843,12 @@ bool ASTContext::mergeExtParameterInfo( return true; } -void ASTContext::ResetObjCLayout(const ObjCContainerDecl *CD) { - ObjCLayouts[CD] = nullptr; +void ASTContext::ResetObjCLayout(const ObjCInterfaceDecl *D) { + if (ObjCLayouts.count(D)) { + ObjCLayouts[D] = nullptr; + for (auto *SubClass : ObjCSubClasses[D]) + ResetObjCLayout(SubClass); + } } /// mergeObjCGCQualifiers - This routine merges ObjC's GC attribute of 'LHS' and diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 59c236c9da8c8..503c58a67adeb 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -29,7 +29,7 @@ namespace interp { template class DeclScope final : public LocalScope { public: DeclScope(Compiler *Ctx, const ValueDecl *VD) - : LocalScope(Ctx, VD), Scope(Ctx->P, VD), + : LocalScope(Ctx, VD), Scope(Ctx->P), OldInitializingDecl(Ctx->InitializingDecl) { Ctx->InitializingDecl = VD; Ctx->InitStack.push_back(InitLink::Decl(VD)); @@ -272,7 +272,8 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { CurType = B->getType(); } else { unsigned DerivedOffset = collectBaseOffset(B->getType(), CurType); - if (!this->emitGetPtrBasePop(DerivedOffset, CE)) + if (!this->emitGetPtrBasePop( + DerivedOffset, /*NullOK=*/CE->getType()->isPointerType(), CE)) return false; CurType = B->getType(); } @@ -288,7 +289,8 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { unsigned DerivedOffset = collectBaseOffset(SubExpr->getType(), CE->getType()); - return this->emitGetPtrDerivedPop(DerivedOffset, CE); + return this->emitGetPtrDerivedPop( + DerivedOffset, /*NullOK=*/CE->getType()->isPointerType(), CE); } case CK_FloatingCast: { diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index c80be094856b0..c07690a3d941c 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -1063,7 +1063,8 @@ bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm, return false; } - if (!Ptr.isRoot() || Ptr.isOnePastEnd() || Ptr.isArrayElement()) { + if (!Ptr.isRoot() || Ptr.isOnePastEnd() || + (Ptr.isArrayElement() && Ptr.getIndex() != 0)) { const SourceInfo &Loc = S.Current->getSource(OpPC); S.FFDiag(Loc, diag::note_constexpr_delete_subobject) << Ptr.toDiagnosticString(S.getASTContext()) << Ptr.isOnePastEnd(); @@ -1432,7 +1433,7 @@ bool CallVirt(InterpState &S, CodePtr OpPC, const Function *Func, unsigned Offset = S.getContext().collectBaseOffset( InitialPointeeType->getAsRecordDecl(), OverriderPointeeType->getAsRecordDecl()); - return GetPtrBasePop(S, 
OpPC, Offset); + return GetPtrBasePop(S, OpPC, Offset, /*IsNullOK=*/true); } return true; diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 73cc107b7dbff..ca74046038072 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -1568,10 +1568,20 @@ inline bool GetPtrActiveThisField(InterpState &S, CodePtr OpPC, uint32_t Off) { return true; } -inline bool GetPtrDerivedPop(InterpState &S, CodePtr OpPC, uint32_t Off) { +inline bool GetPtrDerivedPop(InterpState &S, CodePtr OpPC, uint32_t Off, + bool NullOK) { const Pointer &Ptr = S.Stk.pop(); - if (!CheckNull(S, OpPC, Ptr, CSK_Derived)) + if (!NullOK && !CheckNull(S, OpPC, Ptr, CSK_Derived)) return false; + + if (!Ptr.isBlockPointer()) { + // FIXME: We don't have the necessary information in integral pointers. + // The Descriptor only has a record, but that does of course not include + // the potential derived classes of said record. + S.Stk.push(Ptr); + return true; + } + if (!CheckSubobject(S, OpPC, Ptr, CSK_Derived)) return false; if (!CheckDowncast(S, OpPC, Ptr, Off)) @@ -1600,10 +1610,11 @@ inline bool GetPtrBase(InterpState &S, CodePtr OpPC, uint32_t Off) { return true; } -inline bool GetPtrBasePop(InterpState &S, CodePtr OpPC, uint32_t Off) { +inline bool GetPtrBasePop(InterpState &S, CodePtr OpPC, uint32_t Off, + bool NullOK) { const Pointer &Ptr = S.Stk.pop(); - if (!CheckNull(S, OpPC, Ptr, CSK_Base)) + if (!NullOK && !CheckNull(S, OpPC, Ptr, CSK_Base)) return false; if (!Ptr.isBlockPointer()) { @@ -2915,13 +2926,17 @@ inline bool AllocN(InterpState &S, CodePtr OpPC, PrimType T, const Expr *Source, S.Stk.push(0, nullptr); return true; } + assert(NumElements.isPositive()); DynamicAllocator &Allocator = S.getAllocator(); Block *B = Allocator.allocate(Source, T, static_cast(NumElements), S.Ctx.getEvalID(), DynamicAllocator::Form::Array); assert(B); - S.Stk.push(B); + if (NumElements.isZero()) + S.Stk.push(B); + else + S.Stk.push(Pointer(B).atIndex(0)); return true; } @@ -2941,13 +2956,18 @@ inline bool AllocCN(InterpState &S, CodePtr OpPC, const Descriptor *ElementDesc, S.Stk.push(0, ElementDesc); return true; } + assert(NumElements.isPositive()); DynamicAllocator &Allocator = S.getAllocator(); Block *B = Allocator.allocate(ElementDesc, static_cast(NumElements), S.Ctx.getEvalID(), DynamicAllocator::Form::Array); assert(B); - S.Stk.push(B); + if (NumElements.isZero()) + S.Stk.push(B); + else + S.Stk.push(Pointer(B).atIndex(0)); + return true; } diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index 088a3e40fe2a7..41e4bae65c195 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -312,7 +312,7 @@ def GetPtrThisField : OffsetOpcode; // [Pointer] -> [Pointer] def GetPtrBase : OffsetOpcode; // [Pointer] -> [Pointer] -def GetPtrBasePop : OffsetOpcode; +def GetPtrBasePop : OffsetOpcode { let Args = [ArgUint32, ArgBool]; } def GetMemberPtrBasePop : Opcode { // Offset of field, which is a base. 
let Args = [ArgSint32]; @@ -322,9 +322,7 @@ def GetMemberPtrBasePop : Opcode { def FinishInitPop : Opcode; def FinishInit : Opcode; -def GetPtrDerivedPop : Opcode { - let Args = [ArgUint32]; -} +def GetPtrDerivedPop : Opcode { let Args = [ArgUint32, ArgBool]; } // [Pointer] -> [Pointer] def GetPtrVirtBasePop : Opcode { diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h index c9c3d20f198c6..d503652abb96f 100644 --- a/clang/lib/AST/ByteCode/Program.h +++ b/clang/lib/AST/ByteCode/Program.h @@ -132,20 +132,22 @@ class Program final { /// Context to manage declaration lifetimes. class DeclScope { public: - DeclScope(Program &P, const ValueDecl *VD) : P(P) { - P.startDeclaration(VD); + DeclScope(Program &P) : P(P), PrevDecl(P.CurrentDeclaration) { + ++P.LastDeclaration; + P.CurrentDeclaration = P.LastDeclaration; } - ~DeclScope() { P.endDeclaration(); } + ~DeclScope() { P.CurrentDeclaration = PrevDecl; } private: Program &P; + unsigned PrevDecl; }; /// Returns the current declaration ID. std::optional getCurrentDecl() const { if (CurrentDeclaration == NoDeclaration) - return std::optional{}; - return LastDeclaration; + return std::nullopt; + return CurrentDeclaration; } private: @@ -218,21 +220,12 @@ class Program final { } /// No declaration ID. - static constexpr unsigned NoDeclaration = (unsigned)-1; + static constexpr unsigned NoDeclaration = ~0u; /// Last declaration ID. unsigned LastDeclaration = 0; /// Current declaration ID. unsigned CurrentDeclaration = NoDeclaration; - /// Starts evaluating a declaration. - void startDeclaration(const ValueDecl *Decl) { - LastDeclaration += 1; - CurrentDeclaration = LastDeclaration; - } - - /// Ends a global declaration. - void endDeclaration() { CurrentDeclaration = NoDeclaration; } - public: /// Dumps the disassembled bytecode to \c llvm::errs(). void dump() const; diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 1aa48f0026335..7eff776882629 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -3504,10 +3504,13 @@ VarDecl *BindingDecl::getHoldingVar() const { return VD; } -llvm::ArrayRef BindingDecl::getBindingPackExprs() const { +llvm::ArrayRef BindingDecl::getBindingPackDecls() const { assert(Binding && "expecting a pack expr"); - auto *RP = cast(Binding); - return RP->getExprs(); + auto *FP = cast(Binding); + ValueDecl *const *First = FP->getNumExpansions() > 0 ? FP->begin() : nullptr; + assert((!First || isa(*First)) && "expecting a BindingDecl"); + return llvm::ArrayRef( + reinterpret_cast(First), FP->getNumExpansions()); } void DecompositionDecl::anchor() {} diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 460167c1b9a3d..6f570139630d8 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -3672,7 +3672,6 @@ bool Expr::HasSideEffects(const ASTContext &Ctx, case PackIndexingExprClass: case HLSLOutArgExprClass: case OpenACCAsteriskSizeExprClass: - case ResolvedUnexpandedPackExprClass: // These never have a side-effect. 
return false; diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp index d900af895b42a..c8d61e2cf3f26 100644 --- a/clang/lib/AST/ExprCXX.cpp +++ b/clang/lib/AST/ExprCXX.cpp @@ -1779,31 +1779,31 @@ TemplateArgument SubstNonTypeTemplateParmPackExpr::getArgumentPack() const { return TemplateArgument(llvm::ArrayRef(Arguments, NumArguments)); } -FunctionParmPackExpr::FunctionParmPackExpr(QualType T, VarDecl *ParamPack, +FunctionParmPackExpr::FunctionParmPackExpr(QualType T, ValueDecl *ParamPack, SourceLocation NameLoc, unsigned NumParams, - VarDecl *const *Params) + ValueDecl *const *Params) : Expr(FunctionParmPackExprClass, T, VK_LValue, OK_Ordinary), ParamPack(ParamPack), NameLoc(NameLoc), NumParameters(NumParams) { if (Params) std::uninitialized_copy(Params, Params + NumParams, - getTrailingObjects()); + getTrailingObjects()); setDependence(ExprDependence::TypeValueInstantiation | ExprDependence::UnexpandedPack); } FunctionParmPackExpr * FunctionParmPackExpr::Create(const ASTContext &Context, QualType T, - VarDecl *ParamPack, SourceLocation NameLoc, - ArrayRef Params) { - return new (Context.Allocate(totalSizeToAlloc(Params.size()))) + ValueDecl *ParamPack, SourceLocation NameLoc, + ArrayRef Params) { + return new (Context.Allocate(totalSizeToAlloc(Params.size()))) FunctionParmPackExpr(T, ParamPack, NameLoc, Params.size(), Params.data()); } FunctionParmPackExpr * FunctionParmPackExpr::CreateEmpty(const ASTContext &Context, unsigned NumParams) { - return new (Context.Allocate(totalSizeToAlloc(NumParams))) + return new (Context.Allocate(totalSizeToAlloc(NumParams))) FunctionParmPackExpr(QualType(), nullptr, SourceLocation(), 0, nullptr); } @@ -1965,52 +1965,3 @@ CXXFoldExpr::CXXFoldExpr(QualType T, UnresolvedLookupExpr *Callee, SubExprs[SubExpr::RHS] = RHS; setDependence(computeDependence(this)); } - -ResolvedUnexpandedPackExpr::ResolvedUnexpandedPackExpr(SourceLocation BL, - QualType QT, - unsigned NumExprs) - : Expr(ResolvedUnexpandedPackExprClass, QT, VK_PRValue, OK_Ordinary), - BeginLoc(BL), NumExprs(NumExprs) { - // C++ [temp.dep.expr]p3 - // An id-expression is type-dependent if it is - // - associated by name lookup with a pack - setDependence(ExprDependence::TypeValueInstantiation | - ExprDependence::UnexpandedPack); -} - -ResolvedUnexpandedPackExpr * -ResolvedUnexpandedPackExpr::CreateDeserialized(ASTContext &Ctx, - unsigned NumExprs) { - void *Mem = Ctx.Allocate(totalSizeToAlloc(NumExprs), - alignof(ResolvedUnexpandedPackExpr)); - return new (Mem) - ResolvedUnexpandedPackExpr(SourceLocation(), QualType(), NumExprs); -} - -ResolvedUnexpandedPackExpr * -ResolvedUnexpandedPackExpr::Create(ASTContext &Ctx, SourceLocation BL, - QualType T, unsigned NumExprs) { - void *Mem = Ctx.Allocate(totalSizeToAlloc(NumExprs), - alignof(ResolvedUnexpandedPackExpr)); - ResolvedUnexpandedPackExpr *New = - new (Mem) ResolvedUnexpandedPackExpr(BL, T, NumExprs); - - auto Exprs = New->getExprs(); - std::uninitialized_fill(Exprs.begin(), Exprs.end(), nullptr); - - return New; -} - -ResolvedUnexpandedPackExpr * -ResolvedUnexpandedPackExpr::Create(ASTContext &Ctx, SourceLocation BL, - QualType T, ArrayRef Exprs) { - auto *New = Create(Ctx, BL, T, Exprs.size()); - std::uninitialized_copy(Exprs.begin(), Exprs.end(), New->getExprs().begin()); - return New; -} - -ResolvedUnexpandedPackExpr *ResolvedUnexpandedPackExpr::getFromDecl(Decl *D) { - if (auto *BD = dyn_cast(D)) - return dyn_cast_if_present(BD->getBinding()); - return nullptr; -} diff --git a/clang/lib/AST/ExprClassification.cpp 
b/clang/lib/AST/ExprClassification.cpp index 5225c3ca773ad..3f37d06cc8f3a 100644 --- a/clang/lib/AST/ExprClassification.cpp +++ b/clang/lib/AST/ExprClassification.cpp @@ -451,13 +451,6 @@ static Cl::Kinds ClassifyInternal(ASTContext &Ctx, const Expr *E) { case Expr::PackExpansionExprClass: return ClassifyInternal(Ctx, cast(E)->getPattern()); - case Expr::ResolvedUnexpandedPackExprClass: { - if (cast(E)->getNumExprs() > 0) - return ClassifyInternal( - Ctx, cast(E)->getExpansion(0)); - return Cl::CL_LValue; - } - case Expr::MaterializeTemporaryExprClass: return cast(E)->isBoundToLvalueReference() ? Cl::CL_LValue diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 043974fb41443..6ccb6e23f8d2f 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -17253,7 +17253,6 @@ static ICEDiag CheckICE(const Expr* E, const ASTContext &Ctx) { case Expr::SYCLUniqueStableNameExprClass: case Expr::CXXParenListInitExprClass: case Expr::HLSLOutArgExprClass: - case Expr::ResolvedUnexpandedPackExprClass: return ICEDiag(IK_NotICE, E->getBeginLoc()); case Expr::InitListExprClass: { diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index e5eb22eae7dd1..4a090118c3d7b 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -4932,8 +4932,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity, case Expr::AtomicExprClass: case Expr::SourceLocExprClass: case Expr::EmbedExprClass: - case Expr::BuiltinBitCastExprClass: - case Expr::ResolvedUnexpandedPackExprClass: { + case Expr::BuiltinBitCastExprClass: { NotPrimaryExpr(); if (!NullOut) { // As bad as this diagnostic is, it's better than crashing. diff --git a/clang/lib/AST/RecordLayoutBuilder.cpp b/clang/lib/AST/RecordLayoutBuilder.cpp index ae6d299024c6d..3e38ba0a43d98 100644 --- a/clang/lib/AST/RecordLayoutBuilder.cpp +++ b/clang/lib/AST/RecordLayoutBuilder.cpp @@ -3481,22 +3481,10 @@ uint64_t ASTContext::getFieldOffset(const ValueDecl *VD) const { } uint64_t ASTContext::lookupFieldBitOffset(const ObjCInterfaceDecl *OID, - const ObjCImplementationDecl *ID, const ObjCIvarDecl *Ivar) const { Ivar = Ivar->getCanonicalDecl(); const ObjCInterfaceDecl *Container = Ivar->getContainingInterface(); - - // FIXME: We should eliminate the need to have ObjCImplementationDecl passed - // in here; it should never be necessary because that should be the lexical - // decl context for the ivar. - - // If we know have an implementation (and the ivar is in it) then - // look up in the implementation layout. - const ASTRecordLayout *RL; - if (ID && declaresSameEntity(ID->getClassInterface(), Container)) - RL = &getASTObjCImplementationLayout(ID); - else - RL = &getASTObjCInterfaceLayout(Container); + const ASTRecordLayout *RL = &getASTObjCInterfaceLayout(Container); // Compute field index. // @@ -3522,8 +3510,7 @@ uint64_t ASTContext::lookupFieldBitOffset(const ObjCInterfaceDecl *OID, /// \param Impl - If given, also include the layout of the interface's /// implementation. This may differ by including synthesized ivars. 
const ASTRecordLayout & -ASTContext::getObjCLayout(const ObjCInterfaceDecl *D, - const ObjCImplementationDecl *Impl) const { +ASTContext::getObjCLayout(const ObjCInterfaceDecl *D) const { // Retrieve the definition if (D->hasExternalLexicalStorage() && !D->getDefinition()) getExternalSource()->CompleteType(const_cast(D)); @@ -3532,22 +3519,9 @@ ASTContext::getObjCLayout(const ObjCInterfaceDecl *D, "Invalid interface decl!"); // Look up this layout, if already laid out, return what we have. - const ObjCContainerDecl *Key = - Impl ? (const ObjCContainerDecl*) Impl : (const ObjCContainerDecl*) D; - if (const ASTRecordLayout *Entry = ObjCLayouts[Key]) + if (const ASTRecordLayout *Entry = ObjCLayouts[D]) return *Entry; - // Add in synthesized ivar count if laying out an implementation. - if (Impl) { - unsigned SynthCount = CountNonClassIvars(D); - // If there aren't any synthesized ivars then reuse the interface - // entry. Note we can't cache this because we simply free all - // entries later; however we shouldn't look up implementations - // frequently. - if (SynthCount == 0) - return getObjCLayout(D, nullptr); - } - ItaniumRecordLayoutBuilder Builder(*this, /*EmptySubobjects=*/nullptr); Builder.Layout(D); @@ -3557,7 +3531,7 @@ ASTContext::getObjCLayout(const ObjCInterfaceDecl *D, /*RequiredAlignment : used by MS-ABI)*/ Builder.Alignment, Builder.getDataSize(), Builder.FieldOffsets); - ObjCLayouts[Key] = NewEntry; + ObjCLayouts[D] = NewEntry; return *NewEntry; } diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index 4b45190fa33ef..c8ea7b52a6241 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -2609,15 +2609,6 @@ void StmtPrinter::VisitPackIndexingExpr(PackIndexingExpr *E) { OS << "]"; } -void StmtPrinter::VisitResolvedUnexpandedPackExpr( - ResolvedUnexpandedPackExpr *E) { - OS << "<getExprs().begin(), E->getExprs().end(), - [this](auto *X) { PrintExpr(X); }, [this] { OS << ", "; }); - OS << ")>>"; -} - void StmtPrinter::VisitSubstNonTypeTemplateParmPackExpr( SubstNonTypeTemplateParmPackExpr *Node) { OS << *Node->getParameterPack(); diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 77ee6611f623f..2603df25ba2a4 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -2287,10 +2287,6 @@ void StmtProfiler::VisitSizeOfPackExpr(const SizeOfPackExpr *S) { ID.AddInteger(0); } } -void StmtProfiler::VisitResolvedUnexpandedPackExpr( - const ResolvedUnexpandedPackExpr *S) { - VisitExpr(S); -} void StmtProfiler::VisitPackIndexingExpr(const PackIndexingExpr *E) { VisitExpr(E); diff --git a/clang/lib/Analysis/LiveVariables.cpp b/clang/lib/Analysis/LiveVariables.cpp index af563702b77bf..c7d3451d37cf6 100644 --- a/clang/lib/Analysis/LiveVariables.cpp +++ b/clang/lib/Analysis/LiveVariables.cpp @@ -664,18 +664,18 @@ void LiveVariables::dumpExprLiveness(const SourceManager &M) { } void LiveVariablesImpl::dumpExprLiveness(const SourceManager &M) { - auto ByBeginLoc = [&M](const Expr *L, const Expr *R) { - return M.isBeforeInTranslationUnit(L->getBeginLoc(), R->getBeginLoc()); + const ASTContext &Ctx = analysisContext.getASTContext(); + auto ByIDs = [&Ctx](const Expr *L, const Expr *R) { + return L->getID(Ctx) < R->getID(Ctx); }; // Don't iterate over blockEndsToLiveness directly because it's not sorted. 
for (const CFGBlock *B : *analysisContext.getCFG()) { - llvm::errs() << "\n[ B" << B->getBlockID() << " (live expressions at block exit) ]\n"; std::vector LiveExprs; llvm::append_range(LiveExprs, blocksEndToLiveness[B].liveExprs); - llvm::sort(LiveExprs, ByBeginLoc); + llvm::sort(LiveExprs, ByIDs); for (const Expr *E : LiveExprs) { llvm::errs() << "\n"; E->dump(); diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp index b4aa3206fcfab..dff990d15dd62 100644 --- a/clang/lib/Basic/Targets/RISCV.cpp +++ b/clang/lib/Basic/Targets/RISCV.cpp @@ -238,6 +238,9 @@ void RISCVTargetInfo::getTargetDefines(const LangOptions &Opts, else Builder.defineMacro("__riscv_32e"); } + + if (Opts.CFProtectionReturn && ISAInfo->hasExtension("zicfiss")) + Builder.defineMacro("__riscv_shadow_stack"); } static constexpr int NumRVVBuiltins = diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index e6c2ac939eb88..47bfd470dbafb 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -5633,22 +5633,6 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, if (!CallArgs.getCleanupsToDeactivate().empty()) deactivateArgCleanupsBeforeCall(*this, CallArgs); - // Assert that the arguments we computed match up. The IR verifier - // will catch this, but this is a common enough source of problems - // during IRGen changes that it's way better for debugging to catch - // it ourselves here. -#ifndef NDEBUG - assert(IRCallArgs.size() == IRFuncTy->getNumParams() || IRFuncTy->isVarArg()); - for (unsigned i = 0; i < IRCallArgs.size(); ++i) { - // Inalloca argument can have different type. - if (IRFunctionArgs.hasInallocaArg() && - i == IRFunctionArgs.getInallocaArgNo()) - continue; - if (i < IRFuncTy->getNumParams()) - assert(IRCallArgs[i]->getType() == IRFuncTy->getParamType(i)); - } -#endif - // Update the largest vector width if any arguments have vector types. for (unsigned i = 0; i < IRCallArgs.size(); ++i) LargestVectorWidth = std::max(LargestVectorWidth, diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp index ebd88bb38849e..d1876f47c0eea 100644 --- a/clang/lib/CodeGen/CGObjCGNU.cpp +++ b/clang/lib/CodeGen/CGObjCGNU.cpp @@ -1826,9 +1826,11 @@ class CGObjCGNUstep2 : public CGObjCGNUstep { Context.getASTObjCInterfaceLayout(SuperClassDecl).getSize().getQuantity(); // Instance size is negative for classes that have not yet had their ivar // layout calculated. - classFields.addInt(LongTy, - 0 - (Context.getASTObjCImplementationLayout(OID).getSize().getQuantity() - - superInstanceSize)); + classFields.addInt( + LongTy, 0 - (Context.getASTObjCInterfaceLayout(OID->getClassInterface()) + .getSize() + .getQuantity() - + superInstanceSize)); if (classDecl->all_declared_ivar_begin() == nullptr) classFields.addNullPointer(PtrTy); @@ -3639,8 +3641,9 @@ void CGObjCGNU::GenerateClass(const ObjCImplementationDecl *OID) { } // Get the size of instances. - int instanceSize = - Context.getASTObjCImplementationLayout(OID).getSize().getQuantity(); + int instanceSize = Context.getASTObjCInterfaceLayout(OID->getClassInterface()) + .getSize() + .getQuantity(); // Collect information about instance variables. 
SmallVector IvarNames; diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp index 16986de96bdbc..01552b6e53d00 100644 --- a/clang/lib/CodeGen/CGObjCMac.cpp +++ b/clang/lib/CodeGen/CGObjCMac.cpp @@ -3439,8 +3439,9 @@ void CGObjCMac::GenerateClass(const ObjCImplementationDecl *ID) { else if ((hasMRCWeak = hasMRCWeakIvars(CGM, ID))) Flags |= FragileABI_Class_HasMRCWeakIvars; - CharUnits Size = - CGM.getContext().getASTObjCImplementationLayout(ID).getSize(); + CharUnits Size = CGM.getContext() + .getASTObjCInterfaceLayout(ID->getClassInterface()) + .getSize(); // FIXME: Set CXX-structors flag. if (ID->getClassInterface()->getVisibility() == HiddenVisibility) @@ -6330,7 +6331,7 @@ void CGObjCNonFragileABIMac::GetClassSizeInfo(const ObjCImplementationDecl *OID, uint32_t &InstanceStart, uint32_t &InstanceSize) { const ASTRecordLayout &RL = - CGM.getContext().getASTObjCImplementationLayout(OID); + CGM.getContext().getASTObjCInterfaceLayout(OID->getClassInterface()); // InstanceSize is really instance end. InstanceSize = RL.getDataSize().getQuantity(); diff --git a/clang/lib/CodeGen/CGObjCRuntime.cpp b/clang/lib/CodeGen/CGObjCRuntime.cpp index a7f5c913f42fc..dfb0fd14d93ac 100644 --- a/clang/lib/CodeGen/CGObjCRuntime.cpp +++ b/clang/lib/CodeGen/CGObjCRuntime.cpp @@ -31,15 +31,14 @@ using namespace CodeGen; uint64_t CGObjCRuntime::ComputeIvarBaseOffset(CodeGen::CodeGenModule &CGM, const ObjCInterfaceDecl *OID, const ObjCIvarDecl *Ivar) { - return CGM.getContext().lookupFieldBitOffset(OID, nullptr, Ivar) / + return CGM.getContext().lookupFieldBitOffset(OID, Ivar) / CGM.getContext().getCharWidth(); } uint64_t CGObjCRuntime::ComputeIvarBaseOffset(CodeGen::CodeGenModule &CGM, const ObjCImplementationDecl *OID, const ObjCIvarDecl *Ivar) { - return CGM.getContext().lookupFieldBitOffset(OID->getClassInterface(), OID, - Ivar) / + return CGM.getContext().lookupFieldBitOffset(OID->getClassInterface(), Ivar) / CGM.getContext().getCharWidth(); } @@ -47,8 +46,7 @@ unsigned CGObjCRuntime::ComputeBitfieldBitOffset( CodeGen::CodeGenModule &CGM, const ObjCInterfaceDecl *ID, const ObjCIvarDecl *Ivar) { - return CGM.getContext().lookupFieldBitOffset(ID, ID->getImplementation(), - Ivar); + return CGM.getContext().lookupFieldBitOffset(ID, Ivar); } LValue CGObjCRuntime::EmitValueForIvarAtOffset(CodeGen::CodeGenFunction &CGF, @@ -86,7 +84,7 @@ LValue CGObjCRuntime::EmitValueForIvarAtOffset(CodeGen::CodeGenFunction &CGF, // non-synthesized ivars but we may be called for synthesized ivars. However, // a synthesized ivar can never be a bit-field, so this is safe. uint64_t FieldBitOffset = - CGF.CGM.getContext().lookupFieldBitOffset(OID, nullptr, Ivar); + CGF.CGM.getContext().lookupFieldBitOffset(OID, Ivar); uint64_t BitOffset = FieldBitOffset % CGF.CGM.getContext().getCharWidth(); uint64_t AlignmentBits = CGF.CGM.getTarget().getCharAlign(); uint64_t BitFieldSize = Ivar->getBitWidthValue(); diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index b26c5bf1a909e..75f126965e0ac 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -1480,11 +1480,15 @@ void Darwin::addProfileRTLibs(const ArgList &Args, // If we have a symbol export directive and we're linking in the profile // runtime, automatically export symbols necessary to implement some of the // runtime's functionality. 
- if (hasExportSymbolDirective(Args) && ForGCOV) { - addExportedSymbol(CmdArgs, "___gcov_dump"); - addExportedSymbol(CmdArgs, "___gcov_reset"); - addExportedSymbol(CmdArgs, "_writeout_fn_list"); - addExportedSymbol(CmdArgs, "_reset_fn_list"); + if (hasExportSymbolDirective(Args)) { + if (ForGCOV) { + addExportedSymbol(CmdArgs, "___gcov_dump"); + addExportedSymbol(CmdArgs, "___gcov_reset"); + addExportedSymbol(CmdArgs, "_writeout_fn_list"); + addExportedSymbol(CmdArgs, "_reset_fn_list"); + } else { + addExportedSymbol(CmdArgs, "___llvm_write_custom_profile"); + } } // Align __llvm_prf_{cnts,bits,data} sections to the maximum expected page diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 069fd40e2834c..e68daa422b7c4 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -2596,7 +2596,7 @@ class AnnotatingParser { (!NextNonComment && !Line.InMacroBody) || (NextNonComment && (NextNonComment->isPointerOrReference() || - NextNonComment->is(tok::string_literal) || + NextNonComment->isOneOf(TT_ClassHeadName, tok::string_literal) || (Line.InPragmaDirective && NextNonComment->is(tok::identifier))))) { return false; } @@ -6198,8 +6198,8 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, FormatStyle::PAS_Right && (!Right.Next || Right.Next->isNot(TT_FunctionDeclarationName))); } - if (Right.isOneOf(TT_StartOfName, TT_FunctionDeclarationName) || - Right.is(tok::kw_operator)) { + if (Right.isOneOf(TT_StartOfName, TT_FunctionDeclarationName, + TT_ClassHeadName, tok::kw_operator)) { return true; } if (Left.is(TT_PointerOrReference)) diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 014e629c959e2..b9a5c0589ebc4 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4048,8 +4048,13 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, if (const Arg *A = Args.getLastArg(OPT_fcf_protection_EQ)) { StringRef Name = A->getValue(); - if (Name == "full" || Name == "branch") { + if (Name == "full") { + Opts.CFProtectionBranch = 1; + Opts.CFProtectionReturn = 1; + } else if (Name == "branch") { Opts.CFProtectionBranch = 1; + } else if (Name == "return") { + Opts.CFProtectionReturn = 1; } } diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h index 9dad99ffe9439..355e75d0b2d42 100644 --- a/clang/lib/Headers/amdgpuintrin.h +++ b/clang/lib/Headers/amdgpuintrin.h @@ -162,6 +162,62 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width)); } +// Returns a bitmask marking all lanes that have the same value of __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) { + uint32_t __match_mask = 0; + + bool __done = 0; + while (__gpu_ballot(__lane_mask, !__done)) { + if (!__done) { + uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x); + if (__first == __x) { + __match_mask = __gpu_lane_mask(); + __done = 1; + } + } + } + __gpu_sync_lane(__lane_mask); + return __match_mask; +} + +// Returns a bitmask marking all lanes that have the same value of __x. 
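// Illustrative walk-through of the matching loop in __gpu_match_any_u32 above,
// using assumed lane values: with four active lanes holding __x = {7, 3, 7, 3},
// the first iteration reads __first = 7 from lane 0, so lanes 0 and 2 record
// the converged active-lane mask 0b0101 as __match_mask and mark themselves
// done; the next iteration reads 3 from lane 1, giving lanes 1 and 3 the mask
// 0b1010; the ballot over !__done then returns 0 and the loop exits, leaving
// each lane with a bitmask of exactly the lanes that share its value.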
+_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) { + uint64_t __match_mask = 0; + + bool __done = 0; + while (__gpu_ballot(__lane_mask, __done)) { + if (!__done) { + uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x); + if (__first == __x) { + __match_mask = __gpu_lane_mask(); + __done = 1; + } + } + } + __gpu_sync_lane(__lane_mask); + return __match_mask; +} + +// Returns the current lane mask if every lane contains __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) { + uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x); + uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first); + __gpu_sync_lane(__lane_mask); + return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull; +} + +// Returns the current lane mask if every lane contains __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) { + uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x); + uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first); + __gpu_sync_lane(__lane_mask); + return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull; +} + // Returns true if the flat pointer points to AMDGPU 'shared' memory. _DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) { return __builtin_amdgcn_is_shared((void [[clang::address_space(0)]] *)(( diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h index 40fa2edebe975..f857a87b5f4c7 100644 --- a/clang/lib/Headers/nvptxintrin.h +++ b/clang/lib/Headers/nvptxintrin.h @@ -13,6 +13,10 @@ #error "This file is intended for NVPTX targets or offloading to NVPTX" #endif +#ifndef __CUDA_ARCH__ +#define __CUDA_ARCH__ 0 +#endif + #include #if !defined(__cplusplus) @@ -168,6 +172,76 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x, ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width)); } +// Returns a bitmask marking all lanes that have the same value of __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) { + // Newer targets can use the dedicated CUDA support. + if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700) + return __nvvm_match_any_sync_i32(__lane_mask, __x); + + uint32_t __match_mask = 0; + bool __done = 0; + while (__gpu_ballot(__lane_mask, !__done)) { + if (!__done) { + uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x); + if (__first == __x) { + __match_mask = __gpu_lane_mask(); + __done = 1; + } + } + } + return __match_mask; +} + +// Returns a bitmask marking all lanes that have the same value of __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) { + // Newer targets can use the dedicated CUDA support. + if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700) + return __nvvm_match_any_sync_i64(__lane_mask, __x); + + uint64_t __match_mask = 0; + + bool __done = 0; + while (__gpu_ballot(__lane_mask, __done)) { + if (!__done) { + uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x); + if (__first == __x) { + __match_mask = __gpu_lane_mask(); + __done = 1; + } + } + } + __gpu_sync_lane(__lane_mask); + return __match_mask; +} + +// Returns the current lane mask if every lane contains __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) { + // Newer targets can use the dedicated CUDA support. 
+ int predicate; + if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700) + return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate); + + uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x); + uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first); + return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull; +} + +// Returns the current lane mask if every lane contains __x. +_DEFAULT_FN_ATTRS static __inline__ uint64_t +__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) { + // Newer targets can use the dedicated CUDA support. + int predicate; + if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700) + return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate); + + uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x); + uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first); + return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull; +} + // Returns true if the flat pointer points to CUDA 'shared' memory. _DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) { return __nvvm_isspacep_shared(ptr); diff --git a/clang/lib/Index/CMakeLists.txt b/clang/lib/Index/CMakeLists.txt index b4e294304f115..f0d2b579c8df6 100644 --- a/clang/lib/Index/CMakeLists.txt +++ b/clang/lib/Index/CMakeLists.txt @@ -23,6 +23,7 @@ add_clang_library(clangIndex clangFormat clangFrontend clangLex + clangSema clangSerialization clangToolingCore diff --git a/clang/lib/Index/IndexBody.cpp b/clang/lib/Index/IndexBody.cpp index f1dc4d5831ce7..5e69987820730 100644 --- a/clang/lib/Index/IndexBody.cpp +++ b/clang/lib/Index/IndexBody.cpp @@ -13,6 +13,7 @@ #include "clang/AST/ExprConcepts.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Type.h" +#include "clang/Sema/HeuristicResolver.h" using namespace clang; using namespace clang::index; @@ -168,51 +169,31 @@ class BodyIndexer : public RecursiveASTVisitor { Parent, ParentDC, Roles, Relations, E); } - bool indexDependentReference( - const Expr *E, const Type *T, const DeclarationNameInfo &NameInfo, - llvm::function_ref Filter) { - if (!T) - return true; - const TemplateSpecializationType *TST = - T->getAs(); - if (!TST) - return true; - TemplateName TN = TST->getTemplateName(); - const ClassTemplateDecl *TD = - dyn_cast_or_null(TN.getAsTemplateDecl()); - if (!TD) - return true; - CXXRecordDecl *RD = TD->getTemplatedDecl(); - if (!RD->hasDefinition()) - return true; - RD = RD->getDefinition(); - std::vector Symbols = - RD->lookupDependentName(NameInfo.getName(), Filter); + bool indexDependentReference(const Expr *E, SourceLocation Loc, + std::vector TargetSymbols) { // FIXME: Improve overload handling. 
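+    // The callers below resolve the candidate declarations up front through
+    // the HeuristicResolver, so this helper only has to pick a usable source
+    // location and report the reference.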
- if (Symbols.size() != 1) + if (TargetSymbols.size() != 1) return true; - SourceLocation Loc = NameInfo.getLoc(); if (Loc.isInvalid()) Loc = E->getBeginLoc(); SmallVector Relations; SymbolRoleSet Roles = getRolesForRef(E, Relations); - return IndexCtx.handleReference(Symbols[0], Loc, Parent, ParentDC, Roles, - Relations, E); + return IndexCtx.handleReference(TargetSymbols[0], Loc, Parent, ParentDC, + Roles, Relations, E); } bool VisitCXXDependentScopeMemberExpr(CXXDependentScopeMemberExpr *E) { - const DeclarationNameInfo &Info = E->getMemberNameInfo(); - return indexDependentReference( - E, E->getBaseType().getTypePtrOrNull(), Info, - [](const NamedDecl *D) { return D->isCXXInstanceMember(); }); + auto *Resolver = IndexCtx.getResolver(); + assert(Resolver); + return indexDependentReference(E, E->getMemberNameInfo().getLoc(), + Resolver->resolveMemberExpr(E)); } bool VisitDependentScopeDeclRefExpr(DependentScopeDeclRefExpr *E) { - const DeclarationNameInfo &Info = E->getNameInfo(); - const NestedNameSpecifier *NNS = E->getQualifier(); - return indexDependentReference( - E, NNS->getAsType(), Info, - [](const NamedDecl *D) { return !D->isCXXInstanceMember(); }); + auto *Resolver = IndexCtx.getResolver(); + assert(Resolver); + return indexDependentReference(E, E->getNameInfo().getLoc(), + Resolver->resolveDeclRefExpr(E)); } bool VisitDesignatedInitExpr(DesignatedInitExpr *E) { diff --git a/clang/lib/Index/IndexingContext.cpp b/clang/lib/Index/IndexingContext.cpp index 2dd68dfcc5a70..bdd6c5acf1d34 100644 --- a/clang/lib/Index/IndexingContext.cpp +++ b/clang/lib/Index/IndexingContext.cpp @@ -14,6 +14,7 @@ #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Index/IndexDataConsumer.h" +#include "clang/Sema/HeuristicResolver.h" using namespace clang; using namespace index; @@ -25,6 +26,17 @@ static bool isGeneratedDecl(const Decl *D) { return false; } +IndexingContext::IndexingContext(IndexingOptions IndexOpts, + IndexDataConsumer &DataConsumer) + : IndexOpts(IndexOpts), DataConsumer(DataConsumer) {} + +IndexingContext::~IndexingContext() = default; + +void IndexingContext::setASTContext(ASTContext &ctx) { + Ctx = &ctx; + Resolver = Ctx ? 
std::make_unique(*Ctx) : nullptr; +} + bool IndexingContext::shouldIndex(const Decl *D) { return !isGeneratedDecl(D); } diff --git a/clang/lib/Index/IndexingContext.h b/clang/lib/Index/IndexingContext.h index 3020b33bea385..01bfcb9d578bc 100644 --- a/clang/lib/Index/IndexingContext.h +++ b/clang/lib/Index/IndexingContext.h @@ -21,6 +21,7 @@ namespace clang { class Decl; class DeclGroupRef; class ImportDecl; + class HeuristicResolver; class TagDecl; class TypeSourceInfo; class NamedDecl; @@ -39,15 +40,18 @@ class IndexingContext { IndexingOptions IndexOpts; IndexDataConsumer &DataConsumer; ASTContext *Ctx = nullptr; + std::unique_ptr Resolver; public: - IndexingContext(IndexingOptions IndexOpts, IndexDataConsumer &DataConsumer) - : IndexOpts(IndexOpts), DataConsumer(DataConsumer) {} + IndexingContext(IndexingOptions IndexOpts, IndexDataConsumer &DataConsumer); + ~IndexingContext(); const IndexingOptions &getIndexOpts() const { return IndexOpts; } IndexDataConsumer &getDataConsumer() { return DataConsumer; } - void setASTContext(ASTContext &ctx) { Ctx = &ctx; } + void setASTContext(ASTContext &ctx); + + HeuristicResolver *getResolver() const { return Resolver.get(); } bool shouldIndex(const Decl *D); diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index 8963cad86dbca..1f87001f35b57 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -1239,11 +1239,12 @@ static AnalysisResult analyzePathForGSLPointer(const IndirectLocalPath &Path, } // Check the return type, e.g. // const GSLOwner& func(const Foo& foo [[clang::lifetimebound]]) + // GSLOwner* func(cosnt Foo& foo [[clang::lifetimebound]]) // GSLPointer func(const Foo& foo [[clang::lifetimebound]]) if (FD && - ((FD->getReturnType()->isReferenceType() && + ((FD->getReturnType()->isPointerOrReferenceType() && isRecordWithAttr(FD->getReturnType()->getPointeeType())) || - isPointerLikeType(FD->getReturnType()))) + isGLSPointerType(FD->getReturnType()))) return Report; return Abandon; diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 0cf02fe6407c2..664d48ccbc382 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -980,24 +980,24 @@ static bool CheckBindingsCount(Sema &S, DecompositionDecl *DD, if (IsValid && HasPack) { // Create the pack expr and assign it to the binding. unsigned PackSize = MemberCount - Bindings.size() + 1; - QualType PackType = S.Context.getPackExpansionType( - S.Context.DependentTy, std::nullopt, /*ExpectsPackInType=*/false); - BindingDecl *BD = (*BindingWithPackItr); - auto *RP = ResolvedUnexpandedPackExpr::Create(S.Context, DD->getBeginLoc(), - DecompType, PackSize); - BD->setDecomposedDecl(DD); - BD->setBinding(PackType, RP); BindingDecl *BPack = *BindingWithPackItr; + BPack->setDecomposedDecl(DD); + SmallVector NestedBDs(PackSize); // Create the nested BindingDecls. 
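+      // These nested BindingDecls become the expansions of a
+      // FunctionParmPackExpr, which now models the pack binding instead of
+      // the dedicated ResolvedUnexpandedPackExpr node.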
- for (Expr *&E : RP->getExprs()) { - auto *NestedBD = BindingDecl::Create(S.Context, BPack->getDeclContext(), - BPack->getLocation(), - BPack->getIdentifier(), QualType()); + for (unsigned I = 0; I < PackSize; ++I) { + BindingDecl *NestedBD = BindingDecl::Create( + S.Context, BPack->getDeclContext(), BPack->getLocation(), + BPack->getIdentifier(), QualType()); NestedBD->setDecomposedDecl(DD); - E = S.BuildDeclRefExpr(NestedBD, S.Context.DependentTy, VK_LValue, - BPack->getLocation()); + NestedBDs[I] = NestedBD; } + + QualType PackType = S.Context.getPackExpansionType( + S.Context.DependentTy, PackSize, /*ExpectsPackInType=*/false); + auto *PackExpr = FunctionParmPackExpr::Create( + S.Context, PackType, BPack, BPack->getBeginLoc(), NestedBDs); + BPack->setBinding(PackType, PackExpr); } if (IsValid) diff --git a/clang/lib/Sema/SemaDeclObjC.cpp b/clang/lib/Sema/SemaDeclObjC.cpp index e665d0293dc84..ba9d3dcf19617 100644 --- a/clang/lib/Sema/SemaDeclObjC.cpp +++ b/clang/lib/Sema/SemaDeclObjC.cpp @@ -659,6 +659,7 @@ void SemaObjC::ActOnSuperClassOfClassInterface( IDecl->setSuperClass(SuperClassTInfo); IDecl->setEndOfDefinitionLoc(SuperClassTInfo->getTypeLoc().getEndLoc()); + getASTContext().addObjCSubClass(IDecl->getSuperClass(), IDecl); } } @@ -2129,6 +2130,12 @@ SemaObjC::ActOnFinishObjCImplementation(Decl *ObjCImpDecl, DeclsInGroup.push_back(ObjCImpDecl); + // Reset the cached layout if there are any ivars added to + // the implementation. + if (auto *ImplD = dyn_cast(ObjCImpDecl)) + if (!ImplD->ivar_empty()) + getASTContext().ResetObjCLayout(ImplD->getClassInterface()); + return SemaRef.BuildDeclaratorGroup(DeclsInGroup); } diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp index 8c8ba1da88ebf..a8eb24133a76d 100644 --- a/clang/lib/Sema/SemaExceptionSpec.cpp +++ b/clang/lib/Sema/SemaExceptionSpec.cpp @@ -1286,7 +1286,6 @@ CanThrowResult Sema::canThrow(const Stmt *S) { case Expr::ConvertVectorExprClass: case Expr::VAArgExprClass: case Expr::CXXParenListInitExprClass: - case Expr::ResolvedUnexpandedPackExprClass: return canSubStmtsThrow(*this, S); case Expr::CompoundLiteralExprClass: diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 5817632b61dbd..fad15bf95c415 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -15949,7 +15949,7 @@ ExprResult Sema::ActOnStmtExprResult(ExprResult ER) { // FIXME: Provide a better location for the initialization. return PerformCopyInitialization( InitializedEntity::InitializeStmtExprResult( - E->getBeginLoc(), E->getType().getUnqualifiedType()), + E->getBeginLoc(), E->getType().getAtomicUnqualifiedType()), SourceLocation(), E); } @@ -19430,7 +19430,7 @@ static ExprResult rebuildPotentialResultsAsNonOdrUsed(Sema &S, Expr *E, auto *FPPE = cast(E); // If any of the declarations in the pack is odr-used, then the expression // as a whole constitutes an odr-use. 
- for (VarDecl *D : *FPPE) + for (ValueDecl *D : *FPPE) if (IsPotentialResultOdrUsed(D)) return ExprEmpty(); @@ -19705,7 +19705,7 @@ void Sema::CleanupVarDeclMarking() { MarkVarDeclODRUsed(cast(ME->getMemberDecl()), ME->getMemberLoc(), *this); } else if (auto *FP = dyn_cast(E)) { - for (VarDecl *VD : *FP) + for (ValueDecl *VD : *FP) MarkVarDeclODRUsed(VD, FP->getParameterPackLocation(), *this); } else { llvm_unreachable("Unexpected expression"); @@ -20081,7 +20081,7 @@ void Sema::MarkMemberReferenced(MemberExpr *E) { } void Sema::MarkFunctionParmPackReferenced(FunctionParmPackExpr *E) { - for (VarDecl *VD : *E) + for (ValueDecl *VD : *E) MarkExprReferenced(*this, E->getParameterPackLocation(), VD, E, true, RefsMinusAssignments); } diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index d1a45af6ca58f..121da4916ed43 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1585,20 +1585,16 @@ namespace { SubstNonTypeTemplateParmExpr *E); /// Rebuild a DeclRefExpr for a VarDecl reference. - ExprResult RebuildVarDeclRefExpr(VarDecl *PD, SourceLocation Loc); + ExprResult RebuildVarDeclRefExpr(ValueDecl *PD, SourceLocation Loc); /// Transform a reference to a function or init-capture parameter pack. - ExprResult TransformFunctionParmPackRefExpr(DeclRefExpr *E, VarDecl *PD); + ExprResult TransformFunctionParmPackRefExpr(DeclRefExpr *E, ValueDecl *PD); /// Transform a FunctionParmPackExpr which was built when we couldn't /// expand a function parameter pack reference which refers to an expanded /// pack. ExprResult TransformFunctionParmPackExpr(FunctionParmPackExpr *E); - // Transform a ResolvedUnexpandedPackExpr - ExprResult - TransformResolvedUnexpandedPackExpr(ResolvedUnexpandedPackExpr *E); - QualType TransformFunctionProtoType(TypeLocBuilder &TLB, FunctionProtoTypeLoc TL) { // Call the base version; it will forward to our overridden version below. @@ -2392,7 +2388,7 @@ TemplateInstantiator::TransformSubstNonTypeTemplateParmExpr( SugaredConverted, E->getPackIndex()); } -ExprResult TemplateInstantiator::RebuildVarDeclRefExpr(VarDecl *PD, +ExprResult TemplateInstantiator::RebuildVarDeclRefExpr(ValueDecl *PD, SourceLocation Loc) { DeclarationNameInfo NameInfo(PD->getDeclName(), Loc); return getSema().BuildDeclarationNameExpr(CXXScopeSpec(), NameInfo, PD); @@ -2402,8 +2398,8 @@ ExprResult TemplateInstantiator::TransformFunctionParmPackExpr(FunctionParmPackExpr *E) { if (getSema().ArgumentPackSubstitutionIndex != -1) { // We can expand this parameter pack now. - VarDecl *D = E->getExpansion(getSema().ArgumentPackSubstitutionIndex); - VarDecl *VD = cast_or_null(TransformDecl(E->getExprLoc(), D)); + ValueDecl *D = E->getExpansion(getSema().ArgumentPackSubstitutionIndex); + ValueDecl *VD = cast_or_null(TransformDecl(E->getExprLoc(), D)); if (!VD) return ExprError(); return RebuildVarDeclRefExpr(VD, E->getExprLoc()); @@ -2415,11 +2411,11 @@ TemplateInstantiator::TransformFunctionParmPackExpr(FunctionParmPackExpr *E) { // Transform each of the parameter expansions into the corresponding // parameters in the instantiation of the function decl. 
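+  // FunctionParmPackExpr now stores ValueDecls rather than VarDecls so that
+  // the same node can carry the BindingDecls of an expanded binding pack.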
- SmallVector Vars; + SmallVector Vars; Vars.reserve(E->getNumExpansions()); for (FunctionParmPackExpr::iterator I = E->begin(), End = E->end(); I != End; ++I) { - VarDecl *D = cast_or_null(TransformDecl(E->getExprLoc(), *I)); + ValueDecl *D = cast_or_null(TransformDecl(E->getExprLoc(), *I)); if (!D) return ExprError(); Vars.push_back(D); @@ -2434,7 +2430,7 @@ TemplateInstantiator::TransformFunctionParmPackExpr(FunctionParmPackExpr *E) { ExprResult TemplateInstantiator::TransformFunctionParmPackRefExpr(DeclRefExpr *E, - VarDecl *PD) { + ValueDecl *PD) { typedef LocalInstantiationScope::DeclArgumentPack DeclArgumentPack; llvm::PointerUnion *Found = getSema().CurrentInstantiationScope->findInstantiationOf(PD); @@ -2460,7 +2456,8 @@ TemplateInstantiator::TransformFunctionParmPackRefExpr(DeclRefExpr *E, } // We have either an unexpanded pack or a specific expansion. - return RebuildVarDeclRefExpr(cast(TransformedDecl), E->getExprLoc()); + return RebuildVarDeclRefExpr(cast(TransformedDecl), + E->getExprLoc()); } ExprResult @@ -2482,15 +2479,6 @@ TemplateInstantiator::TransformDeclRefExpr(DeclRefExpr *E) { if (PD->isParameterPack()) return TransformFunctionParmPackRefExpr(E, PD); - if (BindingDecl *BD = dyn_cast(D); BD && BD->isParameterPack()) { - BD = cast_or_null(TransformDecl(BD->getLocation(), BD)); - if (!BD) - return ExprError(); - if (auto *RP = - dyn_cast_if_present(BD->getBinding())) - return TransformResolvedUnexpandedPackExpr(RP); - } - return inherited::TransformDeclRefExpr(E); } @@ -2651,19 +2639,6 @@ TemplateInstantiator::TransformTemplateTypeParmType(TypeLocBuilder &TLB, return Result; } -ExprResult TemplateInstantiator::TransformResolvedUnexpandedPackExpr( - ResolvedUnexpandedPackExpr *E) { - if (getSema().ArgumentPackSubstitutionIndex != -1) { - assert(static_cast(getSema().ArgumentPackSubstitutionIndex) < - E->getNumExprs() && - "ArgumentPackSubstitutionIndex is out of range"); - return TransformExpr( - E->getExpansion(getSema().ArgumentPackSubstitutionIndex)); - } - - return inherited::TransformResolvedUnexpandedPackExpr(E); -} - QualType TemplateInstantiator::TransformSubstTemplateTypeParmPackType( TypeLocBuilder &TLB, SubstTemplateTypeParmPackTypeLoc TL, bool SuppressObjCLifetime) { @@ -4680,7 +4655,7 @@ void LocalInstantiationScope::InstantiatedLocal(const Decl *D, Decl *Inst) { #endif Stored = Inst; } else if (DeclArgumentPack *Pack = dyn_cast(Stored)) { - Pack->push_back(cast(Inst)); + Pack->push_back(cast(Inst)); } else { assert(cast(Stored) == Inst && "Already instantiated this local"); } diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 1f42f9500959e..1cdf80898bfca 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -1179,13 +1179,13 @@ Decl *TemplateDeclInstantiator::VisitDecompositionDecl(DecompositionDecl *D) { // Transform the bindings first. // The transformed DD will have all of the concrete BindingDecls. SmallVector NewBindings; - ResolvedUnexpandedPackExpr *OldResolvedPack = nullptr; + BindingDecl *OldBindingPack = nullptr; for (auto *OldBD : D->bindings()) { Expr *BindingExpr = OldBD->getBinding(); - if (auto *RP = - dyn_cast_if_present(BindingExpr)) { - assert(!OldResolvedPack && "no more than one pack is allowed"); - OldResolvedPack = RP; + if (isa_and_present(BindingExpr)) { + // We have a resolved pack. 
+ assert(!OldBindingPack && "no more than one pack is allowed"); + OldBindingPack = OldBD; } NewBindings.push_back(cast(VisitBindingDecl(OldBD))); } @@ -1198,25 +1198,20 @@ Decl *TemplateDeclInstantiator::VisitDecompositionDecl(DecompositionDecl *D) { for (auto *NewBD : NewBindings) NewBD->setInvalidDecl(); - if (OldResolvedPack) { - // Mark the holding vars (if any) in the pack as instantiated since - // they are created implicitly. + if (OldBindingPack) { + // Mark the bindings in the pack as instantiated. auto Bindings = NewDD->bindings(); - auto BPack = llvm::find_if( + BindingDecl *NewBindingPack = *llvm::find_if( Bindings, [](BindingDecl *D) -> bool { return D->isParameterPack(); }); - auto *NewResolvedPack = - cast((*BPack)->getBinding()); - auto OldExprs = OldResolvedPack->getExprs(); - auto NewExprs = NewResolvedPack->getExprs(); - assert(OldExprs.size() == NewExprs.size()); - for (unsigned I = 0; I < OldResolvedPack->getNumExprs(); I++) { - DeclRefExpr *OldDRE = cast(OldExprs[I]); - BindingDecl *OldNestedBD = cast(OldDRE->getDecl()); - DeclRefExpr *NewDRE = cast(NewExprs[I]); - BindingDecl *NewNestedBD = cast(NewDRE->getDecl()); - SemaRef.CurrentInstantiationScope->InstantiatedLocal(OldNestedBD, - NewNestedBD); - } + assert(NewBindingPack != nullptr && "new bindings should also have a pack"); + llvm::ArrayRef OldDecls = + OldBindingPack->getBindingPackDecls(); + llvm::ArrayRef NewDecls = + NewBindingPack->getBindingPackDecls(); + assert(OldDecls.size() == NewDecls.size()); + for (unsigned I = 0; I < OldDecls.size(); I++) + SemaRef.CurrentInstantiationScope->InstantiatedLocal(OldDecls[I], + NewDecls[I]); } return NewDD; @@ -6280,9 +6275,7 @@ NamedDecl *Sema::FindInstantiatedDecl(SourceLocation Loc, NamedDecl *D, if (auto *BD = dyn_cast(FD); BD && BD->isParameterPack() && ArgumentPackSubstitutionIndex != -1) { - auto *DRE = cast( - BD->getBindingPackExprs()[ArgumentPackSubstitutionIndex]); - return cast(DRE->getDecl()); + return BD->getBindingPackDecls()[ArgumentPackSubstitutionIndex]; } return cast(FD); } diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp index 3c56794722dcc..fad00f7648848 100644 --- a/clang/lib/Sema/SemaTemplateVariadic.cpp +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp @@ -50,13 +50,8 @@ class CollectUnexpandedParameterPacksVisitor auto *FTD = FD ? FD->getDescribedFunctionTemplate() : nullptr; if (FTD && FTD->getTemplateParameters()->getDepth() >= DepthLimit) return; - } else if (auto *BD = dyn_cast(ND)) { - Expr *E = BD->getBinding(); - if (auto *RP = cast_if_present(E)) { - addUnexpanded(RP); - return; - } - } else if (getDepthAndIndex(ND).first >= DepthLimit) { + } else if (ND->isTemplateParameterPack() && + getDepthAndIndex(ND).first >= DepthLimit) { return; } @@ -69,10 +64,6 @@ class CollectUnexpandedParameterPacksVisitor Unexpanded.push_back({T, Loc}); } - void addUnexpanded(ResolvedUnexpandedPackExpr *E) { - Unexpanded.push_back({E, E->getBeginLoc()}); - } - public: explicit CollectUnexpandedParameterPacksVisitor( SmallVectorImpl &Unexpanded) @@ -115,12 +106,6 @@ class CollectUnexpandedParameterPacksVisitor return true; } - bool - VisitResolvedUnexpandedPackExpr(ResolvedUnexpandedPackExpr *E) override { - addUnexpanded(E); - return true; - } - /// Record occurrences of template template parameter packs. 
bool TraverseTemplateName(TemplateName Template) override { if (auto *TTP = dyn_cast_or_null( @@ -782,16 +767,13 @@ bool Sema::CheckParameterPacksForExpansion( unsigned Depth = 0, Index = 0; IdentifierInfo *Name; bool IsVarDeclPack = false; - ResolvedUnexpandedPackExpr *ResolvedPack = nullptr; + FunctionParmPackExpr *BindingPack = nullptr; if (const TemplateTypeParmType *TTP = ParmPack.first.dyn_cast()) { Depth = TTP->getDepth(); Index = TTP->getIndex(); Name = TTP->getIdentifier(); - } else if (auto *RP = - ParmPack.first.dyn_cast()) { - ResolvedPack = RP; } else { NamedDecl *ND = cast(ParmPack.first); if (isa(ND)) @@ -802,8 +784,8 @@ bool Sema::CheckParameterPacksForExpansion( CurrentInstantiationScope->findInstantiationOf(ND); Decl *B = cast(*Instantiation); Expr *BindingExpr = cast(B)->getBinding(); - ResolvedPack = cast_if_present(BindingExpr); - if (!ResolvedPack) { + BindingPack = cast_if_present(BindingExpr); + if (!BindingPack) { ShouldExpand = false; continue; } @@ -829,8 +811,8 @@ bool Sema::CheckParameterPacksForExpansion( ShouldExpand = false; continue; } - } else if (ResolvedPack) { - NewPackSize = ResolvedPack->getNumExprs(); + } else if (BindingPack) { + NewPackSize = BindingPack->getNumExpansions(); } else { // If we don't have a template argument at this depth/index, then we // cannot expand the pack expansion. Make a note of this, but we still @@ -867,7 +849,7 @@ bool Sema::CheckParameterPacksForExpansion( // Template argument deduction can extend the sequence of template // arguments corresponding to a template parameter pack, even when the // sequence contains explicitly specified template arguments. - if (!IsVarDeclPack && !ResolvedPack && CurrentInstantiationScope) { + if (!IsVarDeclPack && CurrentInstantiationScope) { if (NamedDecl *PartialPack = CurrentInstantiationScope->getPartiallySubstitutedPack()) { unsigned PartialDepth, PartialIndex; @@ -973,12 +955,6 @@ std::optional Sema::getNumArgumentsInExpansionFromUnexpanded( Unexpanded[I].first.dyn_cast()) { Depth = TTP->getDepth(); Index = TTP->getIndex(); - } else if (auto *PE = Unexpanded[I] - .first.dyn_cast()) { - unsigned Size = PE->getNumExprs(); - assert((!Result || *Result == Size) && "inconsistent pack sizes"); - Result = Size; - continue; } else { NamedDecl *ND = cast(Unexpanded[I].first); if (isa(ND)) { @@ -1207,12 +1183,8 @@ ExprResult Sema::ActOnSizeofParameterPackExpr(Scope *S, MarkAnyDeclReferenced(OpLoc, ParameterPack, true); - std::optional Length; - if (auto *RP = ResolvedUnexpandedPackExpr::getFromDecl(ParameterPack)) - Length = RP->getNumExprs(); - return SizeOfPackExpr::Create(Context, OpLoc, ParameterPack, NameLoc, - RParenLoc, Length); + RParenLoc); } static bool isParameterPack(Expr *PackExpression) { @@ -1360,7 +1332,7 @@ std::optional Sema::getFullyPackExpandedSize(TemplateArgument Arg) { dyn_cast(Arg.getAsExpr())) Pack = Subst->getArgumentPack(); else if (auto *Subst = dyn_cast(Arg.getAsExpr())) { - for (VarDecl *PD : *Subst) + for (ValueDecl *PD : *Subst) if (PD->isParameterPack()) return std::nullopt; return Subst->getNumExpansions(); diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 73e979927b4f3..05cac8db3c42c 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -3680,13 +3680,6 @@ class TreeTransform { FullySubstituted); } - ExprResult RebuildResolvedUnexpandedPackExpr(SourceLocation BeginLoc, - QualType T, - ArrayRef Exprs) { - return ResolvedUnexpandedPackExpr::Create(SemaRef.Context, BeginLoc, T, - Exprs); - } - /// 
Build a new expression representing a call to a source location /// builtin. /// @@ -16183,24 +16176,6 @@ TreeTransform::TransformFunctionParmPackExpr(FunctionParmPackExpr *E) { return E; } -template -ExprResult TreeTransform::TransformResolvedUnexpandedPackExpr( - ResolvedUnexpandedPackExpr *E) { - bool ArgumentChanged = false; - SmallVector NewExprs; - if (TransformExprs(E->getExprs().begin(), E->getNumExprs(), - /*IsCall=*/false, NewExprs, &ArgumentChanged)) - return ExprError(); - - if (!AlwaysRebuild() && !ArgumentChanged) - return E; - - // NOTE: The type is just a superficial PackExpansionType - // that needs no substitution. - return RebuildResolvedUnexpandedPackExpr(E->getBeginLoc(), E->getType(), - NewExprs); -} - template ExprResult TreeTransform::TransformMaterializeTemporaryExpr( diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index a89eee601e437..fba54023a6bb2 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2208,16 +2208,6 @@ void ASTStmtReader::VisitPackIndexingExpr(PackIndexingExpr *E) { Exprs[I] = Record.readExpr(); } -void ASTStmtReader::VisitResolvedUnexpandedPackExpr( - ResolvedUnexpandedPackExpr *E) { - VisitExpr(E); - E->NumExprs = Record.readInt(); - E->BeginLoc = readSourceLocation(); - auto **Exprs = E->getTrailingObjects(); - for (unsigned I = 0; I < E->NumExprs; ++I) - Exprs[I] = Record.readExpr(); -} - void ASTStmtReader::VisitSubstNonTypeTemplateParmExpr( SubstNonTypeTemplateParmExpr *E) { VisitExpr(E); @@ -2249,11 +2239,11 @@ void ASTStmtReader::VisitSubstNonTypeTemplateParmPackExpr( void ASTStmtReader::VisitFunctionParmPackExpr(FunctionParmPackExpr *E) { VisitExpr(E); E->NumParameters = Record.readInt(); - E->ParamPack = readDeclAs(); + E->ParamPack = readDeclAs(); E->NameLoc = readSourceLocation(); - auto **Parms = E->getTrailingObjects(); + auto **Parms = E->getTrailingObjects(); for (unsigned i = 0, n = E->NumParameters; i != n; ++i) - Parms[i] = readDeclAs(); + Parms[i] = readDeclAs(); } void ASTStmtReader::VisitMaterializeTemporaryExpr(MaterializeTemporaryExpr *E) { @@ -4321,12 +4311,6 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { /*TransformedExprs=*/Record[ASTStmtReader::NumExprFields]); break; - case EXPR_RESOLVED_UNEXPANDED_PACK: - S = ResolvedUnexpandedPackExpr::CreateDeserialized( - Context, - /*NumExprs=*/Record[ASTStmtReader::NumExprFields]); - break; - case EXPR_SUBST_NON_TYPE_TEMPLATE_PARM: S = new (Context) SubstNonTypeTemplateParmExpr(Empty); break; diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 64791300fe722..79b777cddd0b0 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -874,7 +874,6 @@ static void AddStmtsExprs(llvm::BitstreamWriter &Stream, RECORD(EXPR_PACK_EXPANSION); RECORD(EXPR_SIZEOF_PACK); RECORD(EXPR_PACK_INDEXING); - RECORD(EXPR_RESOLVED_UNEXPANDED_PACK); RECORD(EXPR_SUBST_NON_TYPE_TEMPLATE_PARM); RECORD(EXPR_SUBST_NON_TYPE_TEMPLATE_PARM_PACK); RECORD(EXPR_FUNCTION_PARM_PACK); diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 6a779f1618287..2687231d7820f 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2210,16 +2210,6 @@ void ASTStmtWriter::VisitPackIndexingExpr(PackIndexingExpr *E) { Code = serialization::EXPR_PACK_INDEXING; } -void ASTStmtWriter::VisitResolvedUnexpandedPackExpr( - 
ResolvedUnexpandedPackExpr *E) { - VisitExpr(E); - Record.push_back(E->getNumExprs()); - Record.AddSourceLocation(E->getBeginLoc()); - for (Expr *Sub : E->getExprs()) - Record.AddStmt(Sub); - Code = serialization::EXPR_RESOLVED_UNEXPANDED_PACK; -} - void ASTStmtWriter::VisitSubstNonTypeTemplateParmExpr( SubstNonTypeTemplateParmExpr *E) { VisitExpr(E); diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp index f56e9192d1d66..954b4763034e7 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundChecker.cpp @@ -34,24 +34,37 @@ using namespace taint; using llvm::formatv; namespace { -/// If `E` is a "clean" array subscript expression, return the type of the -/// accessed element. If the base of the subscript expression is modified by -/// pointer arithmetic (and not the beginning of a "full" memory region), this -/// always returns nullopt because that's the right (or the least bad) thing to -/// do for the diagnostic output that's relying on this. -static std::optional determineElementType(const Expr *E, - const CheckerContext &C) { +/// If `E` is an array subscript expression with a base that is "clean" (= not +/// modified by pointer arithmetic = the beginning of a memory region), return +/// it as a pointer to ArraySubscriptExpr; otherwise return nullptr. +/// This helper function is used by two separate heuristics that are only valid +/// in these "clean" cases. +static const ArraySubscriptExpr * +getAsCleanArraySubscriptExpr(const Expr *E, const CheckerContext &C) { const auto *ASE = dyn_cast(E); if (!ASE) - return std::nullopt; + return nullptr; const MemRegion *SubscriptBaseReg = C.getSVal(ASE->getBase()).getAsRegion(); if (!SubscriptBaseReg) - return std::nullopt; + return nullptr; // The base of the subscript expression is affected by pointer arithmetics, - // so we want to report byte offsets instead of indices. + // so we want to report byte offsets instead of indices and we don't want to + // activate the "index is unsigned -> cannot be negative" shortcut. if (isa(SubscriptBaseReg->StripCasts())) + return nullptr; + + return ASE; +} + +/// If `E` is a "clean" array subscript expression, return the type of the +/// accessed element; otherwise return std::nullopt because that's the best (or +/// least bad) option for the diagnostic generation that relies on this. +static std::optional determineElementType(const Expr *E, + const CheckerContext &C) { + const auto *ASE = getAsCleanArraySubscriptExpr(E, C); + if (!ASE) return std::nullopt; return ASE->getType(); @@ -140,7 +153,9 @@ class ArrayBoundChecker : public Checker, ProgramStateRef ErrorState, NonLoc Val, bool MarkTaint); - static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC); + static bool isFromCtypeMacro(const Expr *E, ASTContext &AC); + + static bool isOffsetObviouslyNonnegative(const Expr *E, CheckerContext &C); static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State, NonLoc Offset, NonLoc Limit, @@ -587,20 +602,48 @@ void ArrayBoundChecker::performCheck(const Expr *E, CheckerContext &C) const { State, ByteOffset, SVB.makeZeroArrayIndex(), SVB); if (PrecedesLowerBound) { - // The offset may be invalid (negative)... - if (!WithinLowerBound) { - // ...and it cannot be valid (>= 0), so report an error. 
- Messages Msgs = getPrecedesMsgs(Reg, ByteOffset); - reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt); - return; + // The analyzer thinks that the offset may be invalid (negative)... + + if (isOffsetObviouslyNonnegative(E, C)) { + // ...but the offset is obviously non-negative (clear array subscript + // with an unsigned index), so we're in a buggy situation. + + // TODO: Currently the analyzer ignores many casts (e.g. signed -> + // unsigned casts), so it can easily reach states where it will load a + // signed (and negative) value from an unsigned variable. This sanity + // check is a duct tape "solution" that silences most of the ugly false + // positives that are caused by this buggy behavior. Note that this is + // not a complete solution: this cannot silence reports where pointer + // arithmetic complicates the picture and cannot ensure modeling of the + // "unsigned index is positive with highest bit set" cases which are + // "usurped" by the nonsense "unsigned index is negative" case. + // For more information about this topic, see the umbrella ticket + // https://github.com/llvm/llvm-project/issues/39492 + // TODO: Remove this hack once 'SymbolCast's are modeled properly. + + if (!WithinLowerBound) { + // The state is completely nonsense -- let's just sink it! + C.addSink(); + return; + } + // Otherwise continue on the 'WithinLowerBound' branch where the + // unsigned index _is_ non-negative. Don't mention this assumption as a + // note tag, because it would just confuse the users! + } else { + if (!WithinLowerBound) { + // ...and it cannot be valid (>= 0), so report an error. + Messages Msgs = getPrecedesMsgs(Reg, ByteOffset); + reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt); + return; + } + // ...but it can be valid as well, so the checker will (optimistically) + // assume that it's valid and mention this in the note tag. + SUR.recordNonNegativeAssumption(); } - // ...but it can be valid as well, so the checker will (optimistically) - // assume that it's valid and mention this in the note tag. - SUR.recordNonNegativeAssumption(); } // Actually update the state. The "if" only fails in the extremely unlikely - // case when compareValueToThreshold returns {nullptr, nullptr} becasue + // case when compareValueToThreshold returns {nullptr, nullptr} because // evalBinOpNN fails to evaluate the less-than operator. if (WithinLowerBound) State = WithinLowerBound; @@ -660,7 +703,7 @@ void ArrayBoundChecker::performCheck(const Expr *E, CheckerContext &C) const { } // Actually update the state. The "if" only fails in the extremely unlikely - // case when compareValueToThreshold returns {nullptr, nullptr} becasue + // case when compareValueToThreshold returns {nullptr, nullptr} because // evalBinOpNN fails to evaluate the less-than operator. 
if (WithinUpperBound) State = WithinUpperBound; @@ -725,8 +768,8 @@ void ArrayBoundChecker::reportOOB(CheckerContext &C, ProgramStateRef ErrorState, C.emitReport(std::move(BR)); } -bool ArrayBoundChecker::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) { - SourceLocation Loc = S->getBeginLoc(); +bool ArrayBoundChecker::isFromCtypeMacro(const Expr *E, ASTContext &ACtx) { + SourceLocation Loc = E->getBeginLoc(); if (!Loc.isMacroID()) return false; @@ -744,6 +787,14 @@ bool ArrayBoundChecker::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) { (MacroName == "isupper") || (MacroName == "isxdigit")); } +bool ArrayBoundChecker::isOffsetObviouslyNonnegative(const Expr *E, + CheckerContext &C) { + const ArraySubscriptExpr *ASE = getAsCleanArraySubscriptExpr(E, C); + if (!ASE) + return false; + return ASE->getIdx()->getType()->isUnsignedIntegerOrEnumerationType(); +} + bool ArrayBoundChecker::isInAddressOf(const Stmt *S, ASTContext &ACtx) { ParentMapContext &ParentCtx = ACtx.getParentMapContext(); do { diff --git a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp index 7460781799d08..bf35bee70870b 100644 --- a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp @@ -145,6 +145,57 @@ using MutexDescriptor = std::variant; +class SuppressNonBlockingStreams : public BugReporterVisitor { +private: + const CallDescription OpenFunction{CDM::CLibrary, {"open"}, 2}; + SymbolRef StreamSym; + const int NonBlockMacroVal; + bool Satisfied = false; + +public: + SuppressNonBlockingStreams(SymbolRef StreamSym, int NonBlockMacroVal) + : StreamSym(StreamSym), NonBlockMacroVal(NonBlockMacroVal) {} + + static void *getTag() { + static bool Tag; + return &Tag; + } + + void Profile(llvm::FoldingSetNodeID &ID) const override { + ID.AddPointer(getTag()); + } + + PathDiagnosticPieceRef VisitNode(const ExplodedNode *N, + BugReporterContext &BRC, + PathSensitiveBugReport &BR) override { + if (Satisfied) + return nullptr; + + std::optional Point = N->getLocationAs(); + if (!Point) + return nullptr; + + const auto *CE = Point->getStmtAs(); + if (!CE || !OpenFunction.matchesAsWritten(*CE)) + return nullptr; + + if (N->getSVal(CE).getAsSymbol() != StreamSym) + return nullptr; + + Satisfied = true; + + // Check if open's second argument contains O_NONBLOCK + const llvm::APSInt *FlagVal = N->getSVal(CE->getArg(1)).getAsInteger(); + if (!FlagVal) + return nullptr; + + if ((*FlagVal & NonBlockMacroVal) != 0) + BR.markInvalid(getTag(), nullptr); + + return nullptr; + } +}; + class BlockInCriticalSectionChecker : public Checker { private: const std::array MutexDescriptors{ @@ -182,6 +233,9 @@ class BlockInCriticalSectionChecker : public Checker { const BugType BlockInCritSectionBugType{ this, "Call to blocking function in critical section", "Blocking Error"}; + using O_NONBLOCKValueTy = std::optional; + mutable std::optional O_NONBLOCKValue; + void reportBlockInCritSection(const CallEvent &call, CheckerContext &C) const; [[nodiscard]] const NoteTag *createCritSectionNote(CritSectionMarker M, @@ -337,6 +391,28 @@ void BlockInCriticalSectionChecker::reportBlockInCritSection( << "' inside of critical section"; auto R = std::make_unique(BlockInCritSectionBugType, os.str(), ErrNode); + // for 'read' and 'recv' call, check whether it's file descriptor(first + // argument) is + // created by 'open' API with O_NONBLOCK flag or is equal to -1, they will + // not 
cause block in these situations, don't report + StringRef FuncName = Call.getCalleeIdentifier()->getName(); + if (FuncName == "read" || FuncName == "recv") { + SVal SV = Call.getArgSVal(0); + SValBuilder &SVB = C.getSValBuilder(); + ProgramStateRef state = C.getState(); + ConditionTruthVal CTV = + state->areEqual(SV, SVB.makeIntVal(-1, C.getASTContext().IntTy)); + if (CTV.isConstrainedTrue()) + return; + + if (SymbolRef SR = SV.getAsSymbol()) { + if (!O_NONBLOCKValue) + O_NONBLOCKValue = tryExpandAsInteger( + "O_NONBLOCK", C.getBugReporter().getPreprocessor()); + if (*O_NONBLOCKValue) + R->addVisitor(SR, **O_NONBLOCKValue); + } + } R->addRange(Call.getSourceRange()); R->markInteresting(Call.getReturnValue()); C.emitReport(std::move(R)); diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index d93952264a606..c3dcdc985a935 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -1745,7 +1745,6 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, case Stmt::DependentCoawaitExprClass: case Stmt::CoreturnStmtClass: case Stmt::CoyieldExprClass: - case Stmt::ResolvedUnexpandedPackExprClass: case Stmt::SEHTryStmtClass: case Stmt::SEHExceptStmtClass: case Stmt::SEHLeaveStmtClass: diff --git a/clang/test/AST/ByteCode/libcxx/global-decl-id.cpp b/clang/test/AST/ByteCode/libcxx/global-decl-id.cpp new file mode 100644 index 0000000000000..0dd583c3d467f --- /dev/null +++ b/clang/test/AST/ByteCode/libcxx/global-decl-id.cpp @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -std=c++2c -fexperimental-new-constant-interpreter -verify=expected,both %s +// RUN: %clang_cc1 -std=c++2c -verify=ref,both %s + +// both-no-diagnostics + +namespace std { +constexpr int +midpoint(int __a, int ) { + constexpr unsigned __half_diff = 0; + return __half_diff; +} +} +struct Tuple { + int min; + int mid; + constexpr Tuple() { + min = 0; + mid = std::midpoint(min, min); + } +}; +constexpr Tuple tup; + diff --git a/clang/test/AST/ByteCode/new-delete.cpp b/clang/test/AST/ByteCode/new-delete.cpp index e9850d27666e5..7e5f6ab8815ea 100644 --- a/clang/test/AST/ByteCode/new-delete.cpp +++ b/clang/test/AST/ByteCode/new-delete.cpp @@ -922,6 +922,20 @@ namespace NonConstexprArrayCtor { // both-note {{in call to}} } +namespace ArrayBaseCast { + struct A {}; + struct B : A {}; + constexpr bool test() { + B *b = new B[2]; + + A* a = b; + + delete[] b; + return true; + } + static_assert(test()); +} + #else /// Make sure we reject this prior to C++20 constexpr int a() { // both-error {{never produces a constant expression}} diff --git a/clang/test/AST/ByteCode/records.cpp b/clang/test/AST/ByteCode/records.cpp index 9470e7d8e3dcb..3cc3210841e0f 100644 --- a/clang/test/AST/ByteCode/records.cpp +++ b/clang/test/AST/ByteCode/records.cpp @@ -1656,12 +1656,28 @@ namespace ExprWithCleanups { static_assert(F == 1i, ""); } -namespace NullptrUpcast { +namespace NullptrCast { struct A {}; struct B : A { int n; }; + constexpr A *na = nullptr; constexpr B *nb = nullptr; constexpr A &ra = *nb; // both-error {{constant expression}} \ // both-note {{cannot access base class of null pointer}} + constexpr B &rb = (B&)*na; // both-error {{constant expression}} \ + // both-note {{cannot access derived class of null pointer}} + constexpr bool test() { + auto a = (A*)(B*)nullptr; + + return a == nullptr; + } + static_assert(test(), ""); + + constexpr bool test2() { + auto a = (B*)(A*)nullptr; + + return a == nullptr; + } + static_assert(test2(), ""); } 
namespace NonConst { diff --git a/clang/test/AST/ast-dump-binding-pack.cpp b/clang/test/AST/ast-dump-binding-pack.cpp index 81c75a1268730..c4a353ae72a1b 100644 --- a/clang/test/AST/ast-dump-binding-pack.cpp +++ b/clang/test/AST/ast-dump-binding-pack.cpp @@ -22,10 +22,7 @@ void foo() { // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 0 // CHECK-NOT: BindingDecl // CHECK-LABEL: BindingDecl {{.*}} binding_rest -// CHECK-NEXT: ResolvedUnexpandedPackExpr -// CHECK-NEXT: DeclRefExpr {{.*}} lvalue Binding {{.*}} 'binding_rest' -// CHECK-NEXT: DeclRefExpr {{.*}} lvalue Binding {{.*}} 'binding_rest' -// CHECK-NOT: BindingDecl +// CHECK-NEXT: FunctionParmPackExpr // CHECK-LABEL: BindingDecl {{.*}} binding_4 // CHECK-NEXT: ArraySubscriptExpr // CHECK-NEXT: ImplicitCastExpr {{.*}} @@ -47,9 +44,7 @@ void bar() { // CHECK-LABEL: FunctionTemplateDecl {{.*}} bar // CHECK-NOT: BindingDecl // CHECK-LABEL: BindingDecl {{.*}} empty_binding_pack -// CHECK-NEXT: ResolvedUnexpandedPackExpr -// CHECK-NOT: DeclRefExpr {{.*}} 'empty_binding_pack' -// CHECK-NOT: BindingDecl +// CHECK-NEXT: FunctionParmPackExpr // CHECK: DeclStmt struct int_pair { int x; int y; }; @@ -67,8 +62,6 @@ void(*f)() = baz; // CHECK: BindingDecl {{.*}} binding_2 // CHECK-NOT: BindingDecl // CHECK-LABEL: BindingDecl {{.*}} empty_binding_pack -// CHECK-NEXT: ResolvedUnexpandedPackExpr -// CHECK-NOT: DeclRefExpr {{.*}} 'empty_binding_pack' -// CHECK-NOT: BindingDecl +// CHECK-NEXT: FunctionParmPackExpr // CHECK: DeclStmt #endif diff --git a/clang/test/Analysis/Inputs/system-header-simulator-cxx-std-locks.h b/clang/test/Analysis/Inputs/system-header-simulator-cxx-std-locks.h new file mode 100644 index 0000000000000..054dd5405e1be --- /dev/null +++ b/clang/test/Analysis/Inputs/system-header-simulator-cxx-std-locks.h @@ -0,0 +1,13 @@ +#pragma clang system_header + +namespace std { +struct mutex { + void lock(); + void unlock(); +}; + +template struct lock_guard { + lock_guard(std::mutex &); + ~lock_guard(); +}; +} // namespace std diff --git a/clang/test/Analysis/issue-124474.cpp b/clang/test/Analysis/issue-124474.cpp new file mode 100644 index 0000000000000..ae30c4db552c1 --- /dev/null +++ b/clang/test/Analysis/issue-124474.cpp @@ -0,0 +1,37 @@ +// RUN: %clang_analyze_cc1 \ +// RUN: -analyzer-checker=core,unix.BlockInCriticalSection \ +// RUN: -analyzer-output text -verify %s + +// expected-no-diagnostics + +#include "Inputs/system-header-simulator-cxx-std-locks.h" + +std::mutex mtx; +using ssize_t = long long; +using size_t = unsigned long long; +int open(const char *__file, int __oflag, ...); +ssize_t read(int fd, void *buf, size_t count); +void close(int fd); +#define O_RDONLY 00 +#define O_NONBLOCK 04000 + +void foo() { + std::lock_guard lock(mtx); + + const char *filename = "example.txt"; + int fd = open(filename, O_RDONLY | O_NONBLOCK); + + char buffer[200] = {}; + read(fd, buffer, 199); // no-warning: fd is a non-block file descriptor or equals to -1 + close(fd); +} + +void foo1(int fd) { + std::lock_guard lock(mtx); + + const char *filename = "example.txt"; + char buffer[200] = {}; + if (fd == -1) + read(fd, buffer, 199); // no-warning: consider file descriptor is a symbol equals to -1 + close(fd); +} diff --git a/clang/test/Analysis/live-stmts.cpp b/clang/test/Analysis/live-stmts.cpp index 9cac815e65de1..ca2ff6da8b133 100644 --- a/clang/test/Analysis/live-stmts.cpp +++ b/clang/test/Analysis/live-stmts.cpp @@ -1,6 +1,3 @@ -// Disabling this flaky test, see https://github.com/llvm/llvm-project/pull/126913#issuecomment-2655850766 -// UNSUPPORTED: 
true - // RUN: %clang_analyze_cc1 -w -analyzer-checker=debug.DumpLiveExprs %s 2>&1\ // RUN: | FileCheck %s @@ -29,36 +26,36 @@ int testThatDumperWorks(int x, int y, int z) { // CHECK-EMPTY: // CHECK: [ B2 (live expressions at block exit) ] // CHECK-EMPTY: -// CHECK-NEXT: ImplicitCastExpr {{.*}} -// CHECK-NEXT: `-ImplicitCastExpr {{.*}} -// CHECK-NEXT: `-DeclRefExpr {{.*}} 'x' 'int' -// CHECK-EMPTY: // CHECK-NEXT: DeclRefExpr {{.*}} 'y' 'int' // CHECK-EMPTY: // CHECK-NEXT: DeclRefExpr {{.*}} 'z' 'int' // CHECK-EMPTY: -// CHECK-EMPTY: -// CHECK: [ B3 (live expressions at block exit) ] -// CHECK-EMPTY: // CHECK-NEXT: ImplicitCastExpr {{.*}} // CHECK-NEXT: `-ImplicitCastExpr {{.*}} // CHECK-NEXT: `-DeclRefExpr {{.*}} 'x' 'int' // CHECK-EMPTY: -// CHECK-NEXT: DeclRefExpr {{.*}} 'y' 'int' // CHECK-EMPTY: -// CHECK-NEXT: DeclRefExpr {{.*}} 'z' 'int' +// CHECK: [ B3 (live expressions at block exit) ] // CHECK-EMPTY: +// CHECK-NEXT: DeclRefExpr {{.*}} 'y' 'int' // CHECK-EMPTY: -// CHECK: [ B4 (live expressions at block exit) ] +// CHECK-NEXT: DeclRefExpr {{.*}} 'z' 'int' // CHECK-EMPTY: // CHECK-NEXT: ImplicitCastExpr {{.*}} // CHECK-NEXT: `-ImplicitCastExpr {{.*}} // CHECK-NEXT: `-DeclRefExpr {{.*}} 'x' 'int' // CHECK-EMPTY: +// CHECK-EMPTY: +// CHECK: [ B4 (live expressions at block exit) ] +// CHECK-EMPTY: // CHECK-NEXT: DeclRefExpr {{.*}} 'y' 'int' // CHECK-EMPTY: // CHECK-NEXT: DeclRefExpr {{.*}} 'z' 'int' // CHECK-EMPTY: +// CHECK-NEXT: ImplicitCastExpr {{.*}} +// CHECK-NEXT: `-ImplicitCastExpr {{.*}} +// CHECK-NEXT: `-DeclRefExpr {{.*}} 'x' 'int' +// CHECK-EMPTY: // CHECK-EMPTY: // CHECK: [ B5 (live expressions at block exit) ] // CHECK-EMPTY: @@ -228,15 +225,15 @@ int logicalOpInTernary(bool b) { // CHECK: ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: // CHECK: BinaryOperator {{.*}} '_Bool' '||' // CHECK: |-ImplicitCastExpr {{.*}} '_Bool' // CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK: `-ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: -// CHECK: ImplicitCastExpr {{.*}} '_Bool' -// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' -// CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 0 // CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 1 @@ -247,15 +244,15 @@ int logicalOpInTernary(bool b) { // CHECK: ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: // CHECK: BinaryOperator {{.*}} '_Bool' '||' // CHECK: |-ImplicitCastExpr {{.*}} '_Bool' // CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK: `-ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: -// CHECK: ImplicitCastExpr {{.*}} '_Bool' -// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' -// CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 0 // CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 1 @@ -266,15 +263,15 @@ int logicalOpInTernary(bool b) { // CHECK: ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: +// 
CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: // CHECK: BinaryOperator {{.*}} '_Bool' '||' // CHECK: |-ImplicitCastExpr {{.*}} '_Bool' // CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK: `-ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: -// CHECK: ImplicitCastExpr {{.*}} '_Bool' -// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' -// CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 0 // CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 1 @@ -285,15 +282,15 @@ int logicalOpInTernary(bool b) { // CHECK: ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: +// CHECK: ImplicitCastExpr {{.*}} '_Bool' +// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' +// CHECK-EMPTY: // CHECK: BinaryOperator {{.*}} '_Bool' '||' // CHECK: |-ImplicitCastExpr {{.*}} '_Bool' // CHECK: | `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK: `-ImplicitCastExpr {{.*}} '_Bool' // CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' // CHECK-EMPTY: -// CHECK: ImplicitCastExpr {{.*}} '_Bool' -// CHECK: `-DeclRefExpr {{.*}} '_Bool' lvalue ParmVar {{.*}} 'b' '_Bool' -// CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 0 // CHECK-EMPTY: // CHECK: IntegerLiteral {{.*}} 'int' 1 diff --git a/clang/test/Analysis/out-of-bounds.c b/clang/test/Analysis/out-of-bounds.c index 7a094b8fdc840..7d6cb4ecf1b24 100644 --- a/clang/test/Analysis/out-of-bounds.c +++ b/clang/test/Analysis/out-of-bounds.c @@ -188,29 +188,31 @@ int test_cast_to_unsigned(signed char x) { if (x >= 0) return x; // FIXME: Here the analyzer ignores the signed -> unsigned cast, and manages to - // load a negative value from an unsigned variable. This causes an underflow - // report, which is an ugly false positive. + // load a negative value from an unsigned variable. // The underlying issue is tracked by Github ticket #39492. clang_analyzer_value(y); // expected-warning {{8s:{ [-128, -1] } }} - return table[y]; // expected-warning {{Out of bound access to memory preceding}} + // However, a hack in the ArrayBound checker suppresses the false positive + // underflow report that would be generated here. + return table[y]; // no-warning } int test_cast_to_unsigned_overflow(signed char x) { unsigned char y = x; if (x >= 0) return x; - // A variant of 'test_cast_to_unsigned' where the correct behavior would be - // an overflow report (because the negative values are cast to `unsigned - // char` values that are too large). - // FIXME: See comment in 'test_cast_to_unsigned'. + // FIXME: As in 'test_cast_to_unsigned', the analyzer thinks that this + // unsigned variable contains a negative value. clang_analyzer_value(y); // expected-warning {{8s:{ [-128, -1] } }} - return small_table[y]; // expected-warning {{Out of bound access to memory preceding}} + // FIXME: The following subscript expression should produce an overflow + // report (because negative signed char corresponds to unsigned char >= 128); + // but the hack in ArrayBound just silences reports and cannot "restore" the + // real execution paths. + return small_table[y]; // no-warning } int test_negative_offset_with_unsigned_idx(void) { // An example where the subscript operator uses an unsigned index, but the - // underflow report is still justified. 
(We should try to keep this if we - // silence false positives like the one in 'test_cast_to_unsigned'.) + // underflow report is still justified. int *p = table - 10; unsigned idx = 2u; return p[idx]; // expected-warning {{Out of bound access to memory preceding}} diff --git a/clang/test/CodeGenObjC/constant-non-fragile-ivar-offset.m b/clang/test/CodeGenObjC/constant-non-fragile-ivar-offset.m index 8d55e6c7d2308..bc076b4656c9d 100644 --- a/clang/test/CodeGenObjC/constant-non-fragile-ivar-offset.m +++ b/clang/test/CodeGenObjC/constant-non-fragile-ivar-offset.m @@ -9,6 +9,9 @@ // CHECK: @"OBJC_IVAR_$_SubClass.subClassIvar" = constant i64 56 // CHECK: @"OBJC_IVAR_$_SubClass._subClassProperty" = hidden constant i64 64 // CHECK: @"OBJC_IVAR_$_NotStaticLayout.not_static_layout_ivar" = hidden global i64 12 +// CHECK: @"OBJC_IVAR_$_SuperClass2._superClassProperty2" = hidden constant i64 20 +// CHECK: @"OBJC_IVAR_$_IntermediateClass2._IntermediateClass2Property" = hidden constant i64 24 +// CHECK: @"OBJC_IVAR_$_SubClass2._subClass2Property" = hidden constant i64 28 @interface NSObject { int these, will, never, change, ever; @@ -138,3 +141,47 @@ -(void)meth { // CHECK: load i64, ptr @"OBJC_IVAR_$_NotStaticLayout.not_static_layout_ivar } @end + +// CHECK: define internal i32 @"\01-[IntermediateClass2 IntermediateClass2Property]"(ptr noundef %[[SELF:.*]], +// CHECK: %[[SELF_ADDR:.*]] = alloca ptr, align 8 +// CHECK: store ptr %[[SELF]], ptr %[[SELF_ADDR]], align 8 +// CHECK: %[[V0:.*]] = load ptr, ptr %[[SELF_ADDR]], align 8 +// CHECK: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[V0]], i64 24 +// CHECK: %[[LOAD:.*]] = load atomic i32, ptr %[[ADD_PTR]] unordered, align 4 +// CHECK: ret i32 %[[LOAD]] + +// CHECK: define internal i32 @"\01-[SubClass2 subClass2Property]"(ptr noundef %[[SELF:.*]], +// CHECK: %[[SELF_ADDR:.*]] = alloca ptr, align 8 +// CHECK: store ptr %[[SELF]], ptr %[[SELF_ADDR]], align 8 +// CHECK: %[[V0:.*]] = load ptr, ptr %[[SELF_ADDR]], align 8 +// CHECK: %[[ADD_PTR:.*]] = getelementptr inbounds i8, ptr %[[V0]], i64 28 +// CHECK: %[[LOAD:.*]] = load atomic i32, ptr %[[ADD_PTR]] unordered, align 4 +// CHECK: ret i32 %[[LOAD]] + +@interface SuperClass2 : NSObject +@property int superClassProperty2; +@end + +@interface IntermediateClass2 : SuperClass2 +@property int IntermediateClass2Property; +@end + +@interface IntermediateClass3 : SuperClass2 +@property int IntermediateClass3Property; +@end + +@interface SubClass2 : IntermediateClass2 +@property int subClass2Property; +@end + +@implementation IntermediateClass3 +@end + +@implementation SuperClass2 +@end + +@implementation IntermediateClass2 +@end + +@implementation SubClass2 +@end diff --git a/clang/test/CodeGenObjC/ivar-layout-64.m b/clang/test/CodeGenObjC/ivar-layout-64.m index d3ffdfe444c8b..409434ca3bef3 100644 --- a/clang/test/CodeGenObjC/ivar-layout-64.m +++ b/clang/test/CodeGenObjC/ivar-layout-64.m @@ -63,8 +63,8 @@ @interface D : A @end // CHECK: @OBJC_CLASS_NAME_{{.*}} = private unnamed_addr constant {{.*}} c"D\00" -// CHECK: @OBJC_CLASS_NAME_{{.*}} = private unnamed_addr constant {{.*}} c"\11p\00" -// CHECK: @OBJC_CLASS_NAME_{{.*}} = private unnamed_addr constant {{.*}} c"!`\00" +// CHECK: @OBJC_CLASS_NAME_{{.*}} = private unnamed_addr constant {{.*}} c"\11\A0\00" +// CHECK: @OBJC_CLASS_NAME_{{.*}} = private unnamed_addr constant {{.*}} c"!\90\00" @implementation D @synthesize p3 = _p3; diff --git a/clang/test/Driver/print-supported-cpus-aarch64.c b/clang/test/Driver/print-supported-cpus-aarch64.c index 
3c1dcebf7c6c8..3a0ccaf015428 100644 --- a/clang/test/Driver/print-supported-cpus-aarch64.c +++ b/clang/test/Driver/print-supported-cpus-aarch64.c @@ -14,6 +14,7 @@ // CHECK: apple-a15 // CHECK: apple-a16 // CHECK: apple-a17 +// CHECK: apple-a18 // CHECK: apple-a7 // CHECK: apple-a8 // CHECK: apple-a9 @@ -21,7 +22,12 @@ // CHECK: apple-m2 // CHECK: apple-m3 // CHECK: apple-m4 +// CHECK: apple-s10 // CHECK: apple-s4 // CHECK: apple-s5 +// CHECK: apple-s6 +// CHECK: apple-s7 +// CHECK: apple-s8 +// CHECK: apple-s9 // CHECK: Use -mcpu or -mtune to specify the target's processor. diff --git a/clang/test/Index/Core/index-dependent-source.cpp b/clang/test/Index/Core/index-dependent-source.cpp index 8fec9abd1e926..ef414c8fdf7a0 100644 --- a/clang/test/Index/Core/index-dependent-source.cpp +++ b/clang/test/Index/Core/index-dependent-source.cpp @@ -3,7 +3,7 @@ int invalid; class Base { - void baseFunction(); + void baseFunction() const; int baseField; @@ -13,7 +13,7 @@ class Base { template class BaseTemplate { public: - T baseTemplateFunction(); + T baseTemplateFunction() const; T baseTemplateField; @@ -25,7 +25,7 @@ class TemplateClass: public Base , public BaseTemplate { public: ~TemplateClass(); - T function() { } + T function() const { } static void staticFunction() { } @@ -48,27 +48,27 @@ template void indexSimpleDependentDeclarations(const TemplateClass &object) { // Valid instance members: object.function(); -// CHECK: [[@LINE-1]]:10 | instance-method/C++ | function | c:@ST>2#T#T@TemplateClass@F@function# | | Ref,Call,RelCall,RelCont | rel: 1 +// CHECK: [[@LINE-1]]:10 | instance-method/C++ | function | c:@ST>2#T#T@TemplateClass@F@function#1 | | Ref,Call,RelCall,RelCont | rel: 1 object.field; // CHECK: [[@LINE-1]]:10 | field/C++ | field | c:@ST>2#T#T@TemplateClass@FI@field | | Ref,RelCont | rel: 1 object.baseFunction(); -// CHECK: [[@LINE-1]]:10 | instance-method/C++ | baseFunction | c:@S@Base@F@baseFunction# | __ZN4Base12baseFunctionEv | Ref,Call,RelCall,RelCont | rel: 1 +// CHECK: [[@LINE-1]]:10 | instance-method/C++ | baseFunction | c:@S@Base@F@baseFunction#1 | __ZNK4Base12baseFunctionEv | Ref,Call,RelCall,RelCont | rel: 1 object.baseField; // CHECK: [[@LINE-1]]:10 | field/C++ | baseField | c:@S@Base@FI@baseField | | Ref,RelCont | rel: 1 object.baseTemplateFunction(); -// CHECK: [[@LINE-1]]:10 | instance-method/C++ | baseTemplateFunction | c:@ST>1#T@BaseTemplate@F@baseTemplateFunction# | | Ref,Call,RelCall,RelCont | rel: 1 +// CHECK: [[@LINE-1]]:10 | instance-method/C++ | baseTemplateFunction | c:@ST>1#T@BaseTemplate@F@baseTemplateFunction#1 | | Ref,Call,RelCall,RelCont | rel: 1 object.baseTemplateField; // CHECK: [[@LINE-1]]:10 | field/C++ | baseTemplateField | c:@ST>1#T@BaseTemplate@FI@baseTemplateField | | Ref,RelCont | rel: 1 - // Invalid instance members: + // Static members (these are still valid to access via an instance): object.variable; -// CHECK-NOT: [[@LINE-1]]:10 +// CHECK: [[@LINE-1]]:10 | static-property/C++ | variable | c:@ST>2#T#T@TemplateClass@variable | __ZN13TemplateClass8variableE | Ref,RelCont | rel: 1 object.staticFunction(); -// CHECK-NOT: [[@LINE-1]]:10 +// CHECK: [[@LINE-1]]:10 | static-method/C++ | staticFunction | c:@ST>2#T#T@TemplateClass@F@staticFunction#S | | Ref,Call,RelCall,RelCont | rel: 1 object.Struct; -// CHECK-NOT: [[@LINE-1]]:10 +// CHECK: [[@LINE-1]]:10 | struct/C | Struct | c:@ST>2#T#T@TemplateClass@S@Struct | | Ref,RelCont | rel: 1 object.EnumValue; -// CHECK-NOT: [[@LINE-1]]:10 +// CHECK: [[@LINE-1]]:10 | enumerator/C | EnumValue | 
c:@ST>2#T#T@TemplateClass@E@Enum@EnumValue | | Ref,RelCont | rel: 1 // Valid static members: TemplateClass::staticFunction(); diff --git a/clang/test/Misc/target-invalid-cpu-note/aarch64.c b/clang/test/Misc/target-invalid-cpu-note/aarch64.c index e6ff09557fe07..98a2ca0447bcf 100644 --- a/clang/test/Misc/target-invalid-cpu-note/aarch64.c +++ b/clang/test/Misc/target-invalid-cpu-note/aarch64.c @@ -19,6 +19,7 @@ // CHECK-SAME: {{^}}, apple-a15 // CHECK-SAME: {{^}}, apple-a16 // CHECK-SAME: {{^}}, apple-a17 +// CHECK-SAME: {{^}}, apple-a18 // CHECK-SAME: {{^}}, apple-a7 // CHECK-SAME: {{^}}, apple-a8 // CHECK-SAME: {{^}}, apple-a9 @@ -26,8 +27,13 @@ // CHECK-SAME: {{^}}, apple-m2 // CHECK-SAME: {{^}}, apple-m3 // CHECK-SAME: {{^}}, apple-m4 +// CHECK-SAME: {{^}}, apple-s10 // CHECK-SAME: {{^}}, apple-s4 // CHECK-SAME: {{^}}, apple-s5 +// CHECK-SAME: {{^}}, apple-s6 +// CHECK-SAME: {{^}}, apple-s7 +// CHECK-SAME: {{^}}, apple-s8 +// CHECK-SAME: {{^}}, apple-s9 // CHECK-SAME: {{^}}, carmel // CHECK-SAME: {{^}}, cobalt-100 // CHECK-SAME: {{^}}, cortex-a34 diff --git a/clang/test/Preprocessor/riscv-cf-protection-return.c b/clang/test/Preprocessor/riscv-cf-protection-return.c new file mode 100644 index 0000000000000..3a93a88fa6839 --- /dev/null +++ b/clang/test/Preprocessor/riscv-cf-protection-return.c @@ -0,0 +1,44 @@ +// RUN: %clang --target=riscv32 -E -dM %s -o - | \ +// RUN: FileCheck --check-prefixes=NO-MACRO %s + +// RUN: %clang --target=riscv32 -fcf-protection=return -E -dM %s -o - | \ +// RUN: FileCheck --check-prefixes=NO-MACRO %s + +// RUN: %clang --target=riscv32 -fcf-protection=full -E -dM %s -o - | \ +// RUN: FileCheck --check-prefixes=NO-MACRO %s + +// RUN: %clang --target=riscv32 -march=rv32i_zicfiss1p0 \ +// RUN: -menable-experimental-extensions -E -dM %s -o - | \ +// RUN: FileCheck --check-prefixes=NO-MACRO %s + +// RUN: %clang --target=riscv32 -march=rv32i_zicfiss1p0 \ +// RUN: -menable-experimental-extensions -fcf-protection=return -E -dM %s \ +// RUN: -o - | FileCheck --check-prefixes=SHSTK-MACRO %s + +// RUN: %clang --target=riscv32 -march=rv32i_zicfiss1p0 \ +// RUN: -menable-experimental-extensions -fcf-protection=full -E -dM %s -o - \ +// RUN: | FileCheck --check-prefixes=SHSTK-MACRO %s + +// RUN: %clang --target=riscv64 -E -dM %s -o - | \ +// RUN: FileCheck --check-prefixes=NO-MACRO %s + +// RUN: %clang --target=riscv64 -fcf-protection=return -E -dM %s -o - | \ +// RUN: FileCheck --check-prefixes=NO-MACRO %s + +// RUN: %clang --target=riscv64 -fcf-protection=full -E -dM %s -o - | \ +// RUN: FileCheck --check-prefixes=NO-MACRO %s + +// RUN: %clang --target=riscv64 -march=rv64i_zicfiss1p0 \ +// RUN: -menable-experimental-extensions -E -dM %s -o - | \ +// RUN: FileCheck --check-prefixes=NO-MACRO %s + +// RUN: %clang --target=riscv64 -march=rv64i_zicfiss1p0 \ +// RUN: -menable-experimental-extensions -fcf-protection=return -E -dM %s \ +// RUN: -o - | FileCheck --check-prefixes=SHSTK-MACRO %s + +// RUN: %clang --target=riscv64 -march=rv64i_zicfiss1p0 \ +// RUN: -menable-experimental-extensions -fcf-protection=full -E -dM %s -o - \ +// RUN: | FileCheck --check-prefixes=SHSTK-MACRO %s + +// SHSTK-MACRO: __riscv_shadow_stack 1{{$}} +// NO-MACRO-NOT: __riscv_shadow_stack diff --git a/clang/test/Sema/Inputs/lifetime-analysis.h b/clang/test/Sema/Inputs/lifetime-analysis.h index d318033ff0cc4..2072e4603cead 100644 --- a/clang/test/Sema/Inputs/lifetime-analysis.h +++ b/clang/test/Sema/Inputs/lifetime-analysis.h @@ -61,6 +61,7 @@ struct basic_string_view { basic_string_view(); 
basic_string_view(const T *); const T *begin() const; + const T *data() const; }; using string_view = basic_string_view; @@ -80,6 +81,7 @@ struct basic_string { const T *c_str() const; operator basic_string_view () const; using const_iterator = iter; + const T *data() const; }; using string = basic_string; diff --git a/clang/test/Sema/gh106576.c b/clang/test/Sema/gh106576.c new file mode 100644 index 0000000000000..a72592aac0129 --- /dev/null +++ b/clang/test/Sema/gh106576.c @@ -0,0 +1,14 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +typedef _Atomic char atomic_char; + +atomic_char counter; + +char load_plus_one(void) { + return ({counter;}) + 1; // no crash +} + +char type_of_stmt_expr(void) { + typeof(({counter;})) y = ""; // expected-error-re {{incompatible pointer to integer conversion initializing 'typeof (({{{.*}}}))' (aka 'char') with an expression of type 'char[1]'}} + return y; +} diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp index 04bb1330ded4c..66a2a19ceb321 100644 --- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp +++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp @@ -852,3 +852,27 @@ struct Test { }; } // namespace GH120543 + +namespace GH127195 { +template +struct StatusOr { + T* operator->() [[clang::lifetimebound]]; + T* value() [[clang::lifetimebound]]; +}; + +const char* foo() { + StatusOr s; + return s->data(); // expected-warning {{address of stack memory associated with local variable}} + + StatusOr s2; + return s2->data(); + + StatusOr> s3; + return s3.value()->value()->data(); + + // FIXME: nested cases are not supported now. + StatusOr> s4; + return s4.value()->value()->data(); +} + +} // namespace GH127195 diff --git a/clang/test/SemaCXX/cxx2c-binding-pack.cpp b/clang/test/SemaCXX/cxx2c-binding-pack.cpp index 5ca249f52b3d8..62e1da565f2b5 100644 --- a/clang/test/SemaCXX/cxx2c-binding-pack.cpp +++ b/clang/test/SemaCXX/cxx2c-binding-pack.cpp @@ -59,6 +59,7 @@ template void decompose_struct() { T obj{1, 2, 3, 6}; auto [x, ...rest, y] = obj; + static_assert(sizeof...(rest) == 2); auto [...empty] = type_{}; static_assert(sizeof...(empty) == 0); @@ -124,6 +125,14 @@ void lambda_capture() { [&x...] { (void)sum(x...); }(); } +struct S2 { + int a, b, c; +}; + +auto X = [] () { + auto [...pack] = S2{}; +}; + int main() { decompose_array(); decompose_tuple(); @@ -133,6 +142,8 @@ int main() { lambda_capture(); lambda_capture(); lambda_capture(); + X(); + } // P1061R10 Stuff @@ -188,3 +199,22 @@ void other_main() { static_assert(f() == 2); } } // namespace + +namespace { +struct S { + int a,b,c; +}; + +clsss S2 { // expected-error{{{unknown type name 'clsss'}}} +public: + int a,b,c; +}; + +// Should not crash. 
+auto X = [] () { + auto [...pack,a,b,c] = S{}; + auto [x,y,z,...pack2] = S{}; + auto [...pack3] = S2{}; + static_assert(sizeof...(pack3) == 5); +}; +} // namespace diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index 9cdb1eae56187..3bdeb461e4bfa 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -298,12 +298,14 @@ class ResourceDirectoryCache { }; if (llvm::sys::ExecuteAndWait(ClangBinaryPath, PrintResourceDirArgs, {}, Redirects)) { - auto ErrorBuf = llvm::MemoryBuffer::getFile(ErrorFile.c_str()); + auto ErrorBuf = + llvm::MemoryBuffer::getFile(ErrorFile.c_str(), /*IsText=*/true); llvm::errs() << ErrorBuf.get()->getBuffer(); return ""; } - auto OutputBuf = llvm::MemoryBuffer::getFile(OutputFile.c_str()); + auto OutputBuf = + llvm::MemoryBuffer::getFile(OutputFile.c_str(), /*IsText=*/true); if (!OutputBuf) return ""; StringRef Output = OutputBuf.get()->getBuffer().rtrim('\n'); @@ -1032,7 +1034,8 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) { std::unique_ptr TU; std::optional TUBuffer; if (!TranslationUnitFile.empty()) { - auto MaybeTU = llvm::MemoryBuffer::getFile(TranslationUnitFile); + auto MaybeTU = + llvm::MemoryBuffer::getFile(TranslationUnitFile, /*IsText=*/true); if (!MaybeTU) { llvm::errs() << "cannot open input translation unit: " << MaybeTU.getError().message() << "\n"; diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp index 9ca0ce36bb7f2..0810c38bb751b 100644 --- a/clang/tools/libclang/CXCursor.cpp +++ b/clang/tools/libclang/CXCursor.cpp @@ -338,7 +338,6 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent, case Stmt::EmbedExprClass: case Stmt::HLSLOutArgExprClass: case Stmt::OpenACCAsteriskSizeExprClass: - case Stmt::ResolvedUnexpandedPackExprClass: K = CXCursor_UnexposedExpr; break; diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 2365a7c40bf76..d6d028436d39c 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -29028,6 +29028,11 @@ TEST_F(FormatTest, WrapNamespaceBodyWithEmptyLinesAlways) { Style); } +TEST_F(FormatTest, BreakBeforeClassName) { + verifyFormat("class ABSL_ATTRIBUTE_TRIVIAL_ABI ABSL_NULLABILITY_COMPATIBLE\n" + " ArenaSafeUniquePtr {};"); +} + } // namespace } // namespace test } // namespace format diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 1d0870c818acc..8ada6c3daeaf6 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -3250,6 +3250,10 @@ TEST_F(TokenAnnotatorTest, StartOfName) { EXPECT_TOKEN(Tokens[0], tok::at, TT_ObjCDecl); EXPECT_TOKEN(Tokens[2], tok::identifier, TT_StartOfName); + Tokens = annotate("class FOO BAR C {};"); + ASSERT_EQ(Tokens.size(), 8u) << Tokens; + EXPECT_TOKEN(Tokens[2], tok::identifier, TT_Unknown); // Not StartOfName + auto Style = getLLVMStyle(); Style.StatementAttributeLikeMacros.push_back("emit"); Tokens = annotate("emit foo = 0;", Style); diff --git a/compiler-rt/lib/builtins/arm/negdf2vfp.S b/compiler-rt/lib/builtins/arm/negdf2vfp.S index b7cf91877e38c..329c6de757f68 100644 --- a/compiler-rt/lib/builtins/arm/negdf2vfp.S +++ b/compiler-rt/lib/builtins/arm/negdf2vfp.S @@ -20,7 +20,11 @@ DEFINE_COMPILERRT_FUNCTION(__negdf2vfp) #if defined(COMPILER_RT_ARMHF_TARGET) vneg.f64 d0, d0 #else - eor r1, r1, #-2147483648 
// flip sign bit on double in r0/r1 pair +#if _YUGA_BIG_ENDIAN + eor r0, r0, #0x80000000 // flip sign bit on double in r0/r1 pair +#else + eor r1, r1, #0x80000000 // flip sign bit on double in r0/r1 pair +#endif #endif bx lr END_COMPILERRT_FUNCTION(__negdf2vfp) diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 503d159fd9817..e5eca7947cf9b 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -1362,12 +1362,10 @@ COMPILER_RT_VISIBILITY int __llvm_profile_set_file_object(FILE *File, return 0; } -int __llvm_write_custom_profile(const char *Target, - const __llvm_profile_data *DataBegin, - const __llvm_profile_data *DataEnd, - const char *CountersBegin, - const char *CountersEnd, const char *NamesBegin, - const char *NamesEnd) { +COMPILER_RT_USED int __llvm_write_custom_profile( + const char *Target, const __llvm_profile_data *DataBegin, + const __llvm_profile_data *DataEnd, const char *CountersBegin, + const char *CountersEnd, const char *NamesBegin, const char *NamesEnd) { int ReturnValue = 0, FilenameLength, TargetLength; char *FilenameBuf, *TargetFilename; const char *Filename; diff --git a/compiler-rt/test/orc/TestCases/Generic/Inputs/bar-ret-void-weak.ll b/compiler-rt/test/orc/TestCases/Generic/Inputs/bar-ret-void-weak.ll new file mode 100644 index 0000000000000..7301b43e7c92d --- /dev/null +++ b/compiler-rt/test/orc/TestCases/Generic/Inputs/bar-ret-void-weak.ll @@ -0,0 +1,4 @@ +define weak void @bar() { +entry: + ret void +} diff --git a/compiler-rt/test/orc/TestCases/Generic/Inputs/baz-ret-void-hidden.ll b/compiler-rt/test/orc/TestCases/Generic/Inputs/baz-ret-void-hidden.ll new file mode 100644 index 0000000000000..27e19deea6ebd --- /dev/null +++ b/compiler-rt/test/orc/TestCases/Generic/Inputs/baz-ret-void-hidden.ll @@ -0,0 +1,4 @@ +define hidden void @baz() { +entry: + ret void +} diff --git a/compiler-rt/test/orc/TestCases/Generic/lazy-link.ll b/compiler-rt/test/orc/TestCases/Generic/lazy-link.ll index 5a8dbfc532b0f..1c375bcf1e62f 100644 --- a/compiler-rt/test/orc/TestCases/Generic/lazy-link.ll +++ b/compiler-rt/test/orc/TestCases/Generic/lazy-link.ll @@ -6,9 +6,11 @@ ; RUN: rm -rf %t && mkdir -p %t ; RUN: %clang -c -o %t/foo.o %S/Inputs/foo-ret-42.ll ; RUN: %clang -c -o %t/x.o %S/Inputs/var-x-42.ll +; RUN: %clang -c -o %t/bar.o %S/Inputs/bar-ret-void-weak.ll +; RUN: %clang -c -o %t/baz.o %S/Inputs/baz-ret-void-hidden.ll ; RUN: %clang -c -o %t/main.o %s ; RUN: %llvm_jitlink -noexec -show-linked-files %t/main.o -lazy %t/foo.o \ -; RUN: -lazy %t/x.o | FileCheck %s +; RUN: -lazy %t/x.o -lazy %t/bar.o -lazy %t/baz.o | FileCheck %s ; ; UNSUPPORTED: system-windows ; REQUIRES: target={{(arm|aarch|x86_)64.*}} @@ -21,9 +23,15 @@ declare i32 @foo() @x = external global i32 +declare void @bar() +declare hidden void @baz() + + define i32 @main(i32 %argc, ptr %argv) { entry: %foo_result = call i32 @foo() + call void @bar() + call void @baz() %x_val = load i32, ptr @x %result = add nsw i32 %foo_result, %x_val ret i32 %result diff --git a/flang-rt/CMakeLists.txt b/flang-rt/CMakeLists.txt index df35e24ec28a7..50d1a5cb2a591 100644 --- a/flang-rt/CMakeLists.txt +++ b/flang-rt/CMakeLists.txt @@ -115,6 +115,15 @@ endif () extend_path(FLANG_RT_INSTALL_RESOURCE_LIB_PATH "${FLANG_RT_INSTALL_RESOURCE_PATH}" "${toolchain_lib_subdir}") cmake_path(NORMAL_PATH FLANG_RT_OUTPUT_RESOURCE_DIR) cmake_path(NORMAL_PATH FLANG_RT_INSTALL_RESOURCE_PATH) +# FIXME: For the libflang_rt.so, the toolchain 
resource lib dir is not a good +# destination because it is not a ld.so default search path. +# The machine where the executable is eventually executed may not be the +# machine where the Flang compiler and its resource dir are installed, so +# setting RPath by the driver is not a solution. It should belong in +# /usr/lib//libflang_rt.so, like e.g. libgcc_s.so. +# But the linker as invoked by the Flang driver also requires +# libflang_rt.so to be found when linking and the resource lib dir is +# the only reliable location. cmake_path(NORMAL_PATH FLANG_RT_OUTPUT_RESOURCE_LIB_DIR) cmake_path(NORMAL_PATH FLANG_RT_INSTALL_RESOURCE_LIB_PATH) @@ -129,6 +138,27 @@ cmake_path(NORMAL_PATH FLANG_RT_INSTALL_RESOURCE_LIB_PATH) option(FLANG_RT_INCLUDE_TESTS "Generate build targets for the flang-rt unit and regression-tests." "${LLVM_INCLUDE_TESTS}") +option(FLANG_RT_ENABLE_STATIC "Build Flang-RT as a static library." ON) +if (WIN32) + # Windows DLL currently not implemented. + set(FLANG_RT_ENABLE_SHARED OFF) +else () + # TODO: Enable by default to increase test coverage; which version of the + # library to use should be the user's choice anyway. + # Currently, the Flang driver adds `-L"libdir" -lflang_rt` as a linker + # argument, which leaves the choice of which library to use to the linker. + # Since most linkers prefer the shared library, this would constitute a + # breaking change unless the driver is changed. + option(FLANG_RT_ENABLE_SHARED "Build Flang-RT as a shared library." OFF) +endif () +if (NOT FLANG_RT_ENABLE_STATIC AND NOT FLANG_RT_ENABLE_SHARED) + message(FATAL_ERROR " + Must build at least one type of library + (FLANG_RT_ENABLE_STATIC=ON, FLANG_RT_ENABLE_SHARED=ON, or both) + ") +endif () + + set(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT "" CACHE STRING "Compile Flang-RT with GPU support (CUDA or OpenMP)") set_property(CACHE FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT PROPERTY STRINGS "" diff --git a/flang-rt/cmake/modules/AddFlangRT.cmake b/flang-rt/cmake/modules/AddFlangRT.cmake index 630aeb3c65005..a43f1c332187a 100644 --- a/flang-rt/cmake/modules/AddFlangRT.cmake +++ b/flang-rt/cmake/modules/AddFlangRT.cmake @@ -16,7 +16,8 @@ # STATIC # Build a static (.a/.lib) library # OBJECT -# Create only object files without static/dynamic library +# Always create an object library. +# Without SHARED/STATIC, build only the object library. # INSTALL_WITH_TOOLCHAIN # Install library into Clang's resource directory so it can be found by the # Flang driver during compilation, including tests @@ -50,17 +51,73 @@ function (add_flangrt_library name) ") endif () - # Forward libtype to add_library - set(extra_args "") - if (ARG_SHARED) - list(APPEND extra_args SHARED) + # Internal names of libraries. If called with just a single type option, use + # the default name for it. Names of targets must only depend on function + # arguments to be predictable for callers. + set(name_static "${name}.static") + set(name_shared "${name}.shared") + set(name_object "obj.${name}") + if (ARG_STATIC AND NOT ARG_SHARED) + set(name_static "${name}") + elseif (NOT ARG_STATIC AND ARG_SHARED) + set(name_shared "${name}") + elseif (NOT ARG_STATIC AND NOT ARG_SHARED AND ARG_OBJECT) + set(name_object "${name}") + elseif (NOT ARG_STATIC AND NOT ARG_SHARED AND NOT ARG_OBJECT) + # Only one of them will actually be built. + set(name_static "${name}") + set(name_shared "${name}") + endif () + + # Determine what to build. If not explicitly specified, honor + # BUILD_SHARED_LIBS (e.g. for unittest libraries). 
If can build static and + # shared, use ENABLE_STATIC/ENABLE_SHARED setting. + if (ARG_STATIC AND ARG_SHARED) + set(build_static ${FLANG_RT_ENABLE_STATIC}) + set(build_shared ${FLANG_RT_ENABLE_SHARED}) + else () + set(build_static ${ARG_STATIC}) + set(build_shared ${ARG_SHARED}) endif () - if (ARG_STATIC) - list(APPEND extra_args STATIC) + if (NOT ARG_STATIC AND NOT ARG_SHARED AND NOT ARG_OBJECT) + if (BUILD_SHARED_LIBS) + set(build_shared ON) + else () + set(build_static ON) + endif () endif () + + # Build an object library if building multiple libraries at once or if + # explicitly requested. + set(build_object OFF) if (ARG_OBJECT) - list(APPEND extra_args OBJECT) + set(build_object ON) + elseif (build_static AND build_shared) + set(build_object ON) endif () + + # srctargets: targets that contain source files + # libtargets: static/shared if they are built + # alltargets: any add_library target added by this function + set(srctargets "") + set(libtargets "") + set(alltargets "") + if (build_static) + list(APPEND srctargets "${name_static}") + list(APPEND libtargets "${name_static}") + list(APPEND alltargets "${name_static}") + endif () + if (build_shared) + list(APPEND srctargets "${name_shared}") + list(APPEND libtargets "${name_shared}") + list(APPEND alltargets "${name_shared}") + endif () + if (build_object) + set(srctargets "${name_object}") + list(APPEND alltargets "${name_object}") + endif () + + set(extra_args "") if (ARG_EXCLUDE_FROM_ALL) list(APPEND extra_args EXCLUDE_FROM_ALL) endif () @@ -68,132 +125,191 @@ function (add_flangrt_library name) # Also add header files to IDEs to list as part of the library. set_source_files_properties(${ARG_ADDITIONAL_HEADERS} PROPERTIES HEADER_FILE_ONLY ON) - add_library(${name} ${extra_args} ${ARG_ADDITIONAL_HEADERS} ${ARG_UNPARSED_ARGUMENTS}) + # Create selected library types. + if (build_object) + add_library("${name_object}" OBJECT ${extra_args} ${ARG_ADDITIONAL_HEADERS} ${ARG_UNPARSED_ARGUMENTS}) + set_target_properties(${name_object} PROPERTIES + POSITION_INDEPENDENT_CODE ON + FOLDER "Flang-RT/Object Libraries" + ) - if (ARG_INSTALL_WITH_TOOLCHAIN) - set_target_properties(${name} PROPERTIES FOLDER "Flang-RT/Toolchain Libraries") - elseif (ARG_OBJECT) - set_target_properties(${name} PROPERTIES FOLDER "Flang-RT/Object Libraries") - else () - set_target_properties(${name} PROPERTIES FOLDER "Flang-RT/Libraries") + # Replace arguments for the libraries we are going to create. + set(ARG_ADDITIONAL_HEADERS "") + set(ARG_UNPARSED_ARGUMENTS "$") + endif () + if (build_static) + add_library("${name_static}" STATIC ${extra_args} ${ARG_ADDITIONAL_HEADERS} ${ARG_UNPARSED_ARGUMENTS}) + endif () + if (build_shared) + add_library("${name_shared}" SHARED ${extra_args} ${ARG_ADDITIONAL_HEADERS} ${ARG_UNPARSED_ARGUMENTS}) endif () - # Minimum required C++ version for Flang-RT, even if CMAKE_CXX_STANDARD is defined to something else. - target_compile_features(${name} PRIVATE cxx_std_17) + if (libtargets) + # Provide a default alias which exists in either setting. + if (BUILD_SHARED_LIBS) + if (build_shared) + set(default_target "${name_shared}") + else () + set(default_target "${name_static}") + endif () + else () + if (build_static) + set(default_target "${name_static}") + else () + set(default_target "${name_shared}") + endif () + endif () + add_library(${name}.default ALIAS "${default_target}") - # Use compiler-specific options to disable exceptions and RTTI. 
- if (LLVM_COMPILER_IS_GCC_COMPATIBLE) - target_compile_options(${name} PRIVATE - $<$:-fno-exceptions -fno-rtti -fno-unwind-tables -fno-asynchronous-unwind-tables> - ) - elseif (MSVC) - target_compile_options(${name} PRIVATE - $<$:/EHs-c- /GR-> - ) - elseif (CMAKE_CXX_COMPILER_ID MATCHES "XL") - target_compile_options(${name} PRIVATE - $<$:-qnoeh -qnortti> - ) + # Provide a build target that builds any enabled library. + # Not intended for target_link_libraries. Either use the ${name}.static, + # ${name}.shared variants, or ${name}.default to let BUILD_SHARED_LIBS + # decide. + if (NOT TARGET ${name}) + add_custom_target(${name}) + add_dependencies(${name} ${libtargets}) + endif () endif () - # Also for CUDA source when compiling with FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT=CUDA - if (CMAKE_CUDA_COMPILER_ID MATCHES "NVIDIA") - # Assuming gcc as host compiler. - target_compile_options(${name} PRIVATE - $<$:--no-exceptions -Xcompiler -fno-rtti -Xcompiler -fno-unwind-tables -Xcompiler -fno-asynchronous-unwind-tables> - ) - else () - # Assuming a clang-compatible CUDA compiler. - target_compile_options(${name} PRIVATE - $<$:-fno-exceptions -fno-rtti -fno-unwind-tables -fno-asynchronous-unwind-tables> - ) - endif () + foreach (tgtname IN LISTS libtargets) + if (NOT WIN32) + # Use same stem name for .a and .so. Common in UNIX environments. + # Not possible in Windows environments. + set_target_properties(${tgtname} PROPERTIES OUTPUT_NAME "${name}") + endif () + + if (ARG_INSTALL_WITH_TOOLCHAIN) + set_target_properties(${tgtname} PROPERTIES FOLDER "Flang-RT/Toolchain Libraries") + else () + set_target_properties(${tgtname} PROPERTIES FOLDER "Flang-RT/Libraries") + endif () + endforeach () - # Flang-RT's public headers - target_include_directories(${name} PUBLIC "${FLANG_RT_SOURCE_DIR}/include") + # Define how to compile and link the library. + # Some conceptionally only apply to ${srctargets} or ${libtargets}, but we + # apply them to ${alltargets}. In worst case, they are ignored by CMake. + foreach (tgtname IN LISTS alltargets) + # Minimum required C++ version for Flang-RT, even if CMAKE_CXX_STANDARD is defined to something else. + target_compile_features(${tgtname} PRIVATE cxx_std_17) + + # Use compiler-specific options to disable exceptions and RTTI. + if (LLVM_COMPILER_IS_GCC_COMPATIBLE) + target_compile_options(${tgtname} PRIVATE + $<$:-fno-exceptions -fno-rtti -fno-unwind-tables -fno-asynchronous-unwind-tables> + ) + elseif (MSVC) + target_compile_options(${tgtname} PRIVATE + $<$:/EHs-c- /GR-> + ) + elseif (CMAKE_CXX_COMPILER_ID MATCHES "XL") + target_compile_options(${tgtname} PRIVATE + $<$:-qnoeh -qnortti> + ) + endif () - # For ISO_Fortran_binding.h to be found by the runtime itself (Accessed as #include "flang/ISO_Fortran_binding.h") - # User applications can use #include - target_include_directories(${name} PUBLIC "${FLANG_SOURCE_DIR}/include") + # Also for CUDA source when compiling with FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT=CUDA + if (CMAKE_CUDA_COMPILER_ID MATCHES "NVIDIA") + # Assuming gcc as host compiler. + target_compile_options(${tgtname} PRIVATE + $<$:--no-exceptions -Xcompiler -fno-rtti -Xcompiler -fno-unwind-tables -Xcompiler -fno-asynchronous-unwind-tables> + ) + else () + # Assuming a clang-compatible CUDA compiler. 
+ target_compile_options(${tgtname} PRIVATE + $<$:-fno-exceptions -fno-rtti -fno-unwind-tables -fno-asynchronous-unwind-tables> + ) + endif () - # For Flang-RT's configured config.h to be found - target_include_directories(${name} PRIVATE "${FLANG_RT_BINARY_DIR}") + # Flang-RT's public headers + target_include_directories(${tgtname} PUBLIC "${FLANG_RT_SOURCE_DIR}/include") - # Disable libstdc++/libc++ assertions, even in an LLVM_ENABLE_ASSERTIONS - # build, to avoid an unwanted dependency on libstdc++/libc++.so. - if (FLANG_RT_SUPPORTS_UNDEFINE_FLAG) - target_compile_options(${name} PUBLIC -U_GLIBCXX_ASSERTIONS) - target_compile_options(${name} PUBLIC -U_LIBCPP_ENABLE_ASSERTIONS) - endif () + # For ISO_Fortran_binding.h to be found by the runtime itself (Accessed as #include "flang/ISO_Fortran_binding.h") + # User applications can use #include + target_include_directories(${tgtname} PUBLIC "${FLANG_SOURCE_DIR}/include") - # When building the flang runtime if LTO is enabled the archive file - # contains LLVM IR rather than object code. Currently flang is not - # LTO aware so cannot link this file to compiled Fortran code. - if (FLANG_RT_HAS_FNO_LTO_FLAG) - target_compile_options(${name} PRIVATE -fno-lto) - endif () + # For Flang-RT's configured config.h to be found + target_include_directories(${tgtname} PRIVATE "${FLANG_RT_BINARY_DIR}") - # Flang/Clang (including clang-cl) -compiled programs targeting the MSVC ABI - # should only depend on msvcrt/ucrt. LLVM still emits libgcc/compiler-rt - # functions in some cases like 128-bit integer math (__udivti3, __modti3, - # __fixsfti, __floattidf, ...) that msvc does not support. We are injecting a - # dependency to Compiler-RT's builtin library where these are implemented. - if (MSVC AND CMAKE_CXX_COMPILER_ID MATCHES "Clang") - if (FLANG_RT_BUILTINS_LIBRARY) - target_compile_options(${name} PRIVATE "$<$:-Xclang>" "$<$:--dependent-lib=${FLANG_RT_BUILTINS_LIBRARY}>") + # Disable libstdc++/libc++ assertions, even in an LLVM_ENABLE_ASSERTIONS + # build, to avoid an unwanted dependency on libstdc++/libc++.so. + if (FLANG_RT_SUPPORTS_UNDEFINE_FLAG) + target_compile_options(${tgtname} PUBLIC -U_GLIBCXX_ASSERTIONS) + target_compile_options(${tgtname} PUBLIC -U_LIBCPP_ENABLE_ASSERTIONS) endif () - endif () - if (MSVC AND CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") - if (FLANG_RT_BUILTINS_LIBRARY) - target_compile_options(${name} PRIVATE "$<$:-Xflang>" "$<$:--dependent-lib=${FLANG_RT_BUILTINS_LIBRARY}>") - else () - message(WARNING "Did not find libclang_rt.builtins.lib. - LLVM may emit builtins that are not implemented in msvcrt/ucrt and - instead falls back to builtins from Compiler-RT. Linking with ${name} - may result in a linker error.") + + # When building the flang runtime if LTO is enabled the archive file + # contains LLVM IR rather than object code. Currently flang is not + # LTO aware so cannot link this file to compiled Fortran code. + if (FLANG_RT_HAS_FNO_LTO_FLAG) + target_compile_options(${tgtname} PRIVATE -fno-lto) endif () - endif () - # Non-GTest unittests depend on LLVMSupport - if (ARG_LINK_TO_LLVM) - if (LLVM_LINK_LLVM_DYLIB) - set(llvm_libs LLVM) - else() - llvm_map_components_to_libnames(llvm_libs Support) - endif() - target_link_libraries(${name} PUBLIC ${llvm_libs}) - target_include_directories(${name} PUBLIC ${LLVM_INCLUDE_DIRS}) - endif () + # Flang/Clang (including clang-cl) -compiled programs targeting the MSVC ABI + # should only depend on msvcrt/ucrt. 
LLVM still emits libgcc/compiler-rt + # functions in some cases like 128-bit integer math (__udivti3, __modti3, + # __fixsfti, __floattidf, ...) that msvc does not support. We are injecting a + # dependency to Compiler-RT's builtin library where these are implemented. + if (MSVC AND CMAKE_CXX_COMPILER_ID MATCHES "Clang") + if (FLANG_RT_BUILTINS_LIBRARY) + target_compile_options(${tgtname} PRIVATE "$<$:-Xclang>" "$<$:--dependent-lib=${FLANG_RT_BUILTINS_LIBRARY}>") + endif () + endif () + if (MSVC AND CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") + if (FLANG_RT_BUILTINS_LIBRARY) + target_compile_options(${tgtname} PRIVATE "$<$:-Xflang>" "$<$:--dependent-lib=${FLANG_RT_BUILTINS_LIBRARY}>") + else () + message(WARNING "Did not find libclang_rt.builtins.lib. + LLVM may emit builtins that are not implemented in msvcrt/ucrt and + instead falls back to builtins from Compiler-RT. Linking with ${tgtname} + may result in a linker error.") + endif () + endif () - if (ARG_INCLUDE_DIRECTORIES) - target_include_directories(${name} ${ARG_INCLUDE_DIRECTORIES}) - endif () + # Non-GTest unittests depend on LLVMSupport + if (ARG_LINK_TO_LLVM) + if (LLVM_LINK_LLVM_DYLIB) + set(llvm_libs LLVM) + else() + llvm_map_components_to_libnames(llvm_libs Support) + endif() + target_link_libraries(${tgtname} PUBLIC ${llvm_libs}) + target_include_directories(${tgtname} PUBLIC ${LLVM_INCLUDE_DIRS}) + endif () - if (ARG_LINK_LIBRARIES) - target_link_libraries(${name} PUBLIC ${ARG_LINK_LIBRARIES}) - endif () + if (ARG_INCLUDE_DIRECTORIES) + target_include_directories(${tgtname} ${ARG_INCLUDE_DIRECTORIES}) + endif () - # If this is part of the toolchain, put it into the compiler's resource - # directory. Otherwise it is part of testing and is not installed at all. - # TODO: Consider multi-configuration builds (MSVC_IDE, "Ninja Multi-Config") - if (ARG_INSTALL_WITH_TOOLCHAIN) - set_target_properties(${name} - PROPERTIES - ARCHIVE_OUTPUT_DIRECTORY "${FLANG_RT_OUTPUT_RESOURCE_LIB_DIR}" - ) + if (ARG_LINK_LIBRARIES) + target_link_libraries(${tgtname} PUBLIC ${ARG_LINK_LIBRARIES}) + endif () + endforeach () - install(TARGETS ${name} - ARCHIVE DESTINATION "${FLANG_RT_INSTALL_RESOURCE_LIB_PATH}" - ) - endif () + foreach (tgtname IN LISTS libtargets) + # If this is part of the toolchain, put it into the compiler's resource + # directory. Otherwise it is part of testing and is not installed at all. + # TODO: Consider multi-configuration builds (MSVC_IDE, "Ninja Multi-Config") + if (ARG_INSTALL_WITH_TOOLCHAIN) + set_target_properties(${tgtname} + PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY "${FLANG_RT_OUTPUT_RESOURCE_LIB_DIR}" + LIBRARY_OUTPUT_DIRECTORY "${FLANG_RT_OUTPUT_RESOURCE_LIB_DIR}" + ) - if (ARG_TARGET_PROPERTIES) - set_target_properties(${name} PROPERTIES ${ARG_TARGET_PROPERTIES}) - endif () + install(TARGETS ${tgtname} + ARCHIVE DESTINATION "${FLANG_RT_INSTALL_RESOURCE_LIB_PATH}" + LIBRARY DESTINATION "${FLANG_RT_INSTALL_RESOURCE_LIB_PATH}" + ) + endif () - # flang-rt should build all the Flang-RT targets that are built in an - # 'all' build. - if (NOT ARG_EXCLUDE_FROM_ALL) - add_dependencies(flang-rt ${name}) - endif () + if (ARG_TARGET_PROPERTIES) + set_target_properties(${tgtname} PROPERTIES ${ARG_TARGET_PROPERTIES}) + endif () + + # flang-rt should build all the Flang-RT targets that are built in an + # 'all' build. 
+ if (NOT ARG_EXCLUDE_FROM_ALL) + add_dependencies(flang-rt ${tgtname}) + endif () + endforeach () endfunction (add_flangrt_library) diff --git a/flang-rt/cmake/modules/AddFlangRTOffload.cmake b/flang-rt/cmake/modules/AddFlangRTOffload.cmake index 4e4bd60c63545..6dd0d72dc3fd7 100644 --- a/flang-rt/cmake/modules/AddFlangRTOffload.cmake +++ b/flang-rt/cmake/modules/AddFlangRTOffload.cmake @@ -8,9 +8,15 @@ macro(enable_cuda_compilation name files) if (FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA") + if (FLANG_RT_ENABLE_SHARED) + message(FATAL_ERROR + "FLANG_RT_ENABLE_SHARED is not supported for CUDA offload build of Flang-RT" + ) + endif() + enable_language(CUDA) - set_target_properties(${name} + set_target_properties(${name}.static PROPERTIES CUDA_SEPARABLE_COMPILATION ON ) @@ -54,7 +60,7 @@ macro(enable_cuda_compilation name files) # When using libcudacxx headers files, we have to use them # for all files of Flang-RT. if (EXISTS "${FLANG_RT_LIBCUDACXX_PATH}/include") - foreach (tgt IN ITEMS "${name}" "obj.${name}PTX") + foreach (tgt IN ITEMS "${name}.static" "obj.${name}PTX") target_include_directories(${tgt} AFTER PRIVATE "${FLANG_RT_LIBCUDACXX_PATH}/include") target_compile_definitions(${tgt} PRIVATE RT_USE_LIBCUDACXX=1) endforeach () @@ -66,6 +72,12 @@ macro(enable_omp_offload_compilation name files) if (FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "OpenMP") # OpenMP offload build only works with Clang compiler currently. + if (FLANG_RT_ENABLE_SHARED) + message(FATAL_ERROR + "FLANG_RT_ENABLE_SHARED is not supported for OpenMP offload build of Flang-RT" + ) + endif() + if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" AND "${CMAKE_C_COMPILER_ID}" MATCHES "Clang") @@ -84,7 +96,7 @@ macro(enable_omp_offload_compilation name files) set_source_files_properties(${files} PROPERTIES COMPILE_OPTIONS "${OMP_COMPILE_OPTIONS}" ) - target_link_options(${name} PUBLIC ${OMP_COMPILE_OPTIONS}) + target_link_options(${name}.static PUBLIC ${OMP_COMPILE_OPTIONS}) # Enable "declare target" in the source code. set_source_files_properties(${files} diff --git a/flang-rt/examples/ExternalHelloWorld/CMakeLists.txt b/flang-rt/examples/ExternalHelloWorld/CMakeLists.txt index 4fd04f8f2769a..ccc39242745d9 100644 --- a/flang-rt/examples/ExternalHelloWorld/CMakeLists.txt +++ b/flang-rt/examples/ExternalHelloWorld/CMakeLists.txt @@ -13,5 +13,5 @@ add_llvm_example(external-hello-world target_link_libraries(external-hello-world PRIVATE - flang_rt.runtime + flang_rt.runtime.default ) diff --git a/flang-rt/lib/cuda/CMakeLists.txt b/flang-rt/lib/cuda/CMakeLists.txt index d5ca354c1029f..fc9a95bc49dc5 100644 --- a/flang-rt/lib/cuda/CMakeLists.txt +++ b/flang-rt/lib/cuda/CMakeLists.txt @@ -6,8 +6,7 @@ # #===------------------------------------------------------------------------===# - -add_flangrt_library(flang_rt.cuda STATIC +add_flangrt_library(flang_rt.cuda STATIC SHARED allocatable.cpp allocator.cpp descriptor.cpp @@ -17,18 +16,27 @@ add_flangrt_library(flang_rt.cuda STATIC memory.cpp registration.cpp - # libflang_rt.runtime depends on a certain version of CUDA. To be able to have - # multiple build of this library with different CUDA version, the version is - # added to the library name. TARGET_PROPERTIES + # libflang_rt.runtime depends on a certain version of CUDA. To be able to have + # multiple build of this library with different CUDA version, the version is + # added to the library name. 
OUTPUT_NAME "flang_rt.cuda_${CUDAToolkit_VERSION_MAJOR}" - INCLUDE_DIRECTORIES PRIVATE ${CUDAToolkit_INCLUDE_DIRS} ) -target_link_libraries(flang_rt.cuda - PUBLIC - flang_rt.runtime - CUDA::cudart_static -) +# For the static library, link-in the static dependencies as well. +if (TARGET flang_rt.cuda.static) + target_link_libraries(flang_rt.cuda.static PUBLIC + flang_rt.runtime.static + CUDA::cudart_static + ) +endif () + +# For the shared library, use the shared versions of the dependencies. +if (TARGET flang_rt.cuda.shared) + target_link_libraries(flang_rt.cuda.shared PUBLIC + flang_rt.runtime.shared + CUDA::cudart + ) +endif () diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt index 0afcbf2783533..589ee140485ec 100644 --- a/flang-rt/lib/runtime/CMakeLists.txt +++ b/flang-rt/lib/runtime/CMakeLists.txt @@ -128,7 +128,7 @@ set(sources ${supported_sources} ${host_sources} ${f128_sources}) if (NOT WIN32) - add_flangrt_library(flang_rt.runtime STATIC + add_flangrt_library(flang_rt.runtime STATIC SHARED ${sources} LINK_LIBRARIES ${Backtrace_LIBRARY} INSTALL_WITH_TOOLCHAIN @@ -138,10 +138,9 @@ if (NOT WIN32) enable_cuda_compilation(flang_rt.runtime "${supported_sources}") enable_omp_offload_compilation(flang_rt.runtime "${supported_sources}") - # For unittests that depend on flang_rt. Should link to the static version - # of the library. - add_library(flang_rt.runtime.static ALIAS flang_rt.runtime) - add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime) + # Select a default runtime, which is used for unit and regression tests. + get_target_property(default_target flang_rt.runtime.default ALIASED_TARGET) + add_library(flang_rt.runtime.unittest ALIAS "${default_target}") else() # Target for building all versions of the runtime add_custom_target(flang_rt.runtime) diff --git a/flang-rt/test/CMakeLists.txt b/flang-rt/test/CMakeLists.txt index f5f7b8832d381..cb48d22d3accc 100644 --- a/flang-rt/test/CMakeLists.txt +++ b/flang-rt/test/CMakeLists.txt @@ -44,8 +44,8 @@ add_custom_target(flang-rt-test-depends) set_target_properties(flang-rt-test-depends PROPERTIES FOLDER "Flang-RT/Meta") add_dependencies(flang-rt-test-depends FlangRTUnitTests - flang_rt.runtime flang_rt.runtime.unittest + flang_rt.runtime ) add_lit_testsuite(check-flang-rt "Running the Flang-RT regression tests" diff --git a/flang-rt/test/lit.cfg.py b/flang-rt/test/lit.cfg.py index 652da31e6438f..032aeef2d5bf6 100644 --- a/flang-rt/test/lit.cfg.py +++ b/flang-rt/test/lit.cfg.py @@ -92,7 +92,7 @@ def shjoin(args, sep=" "): ("%include", os.path.join(config.flang_source_dir, "include")) ) -# Library path of libflang_rt.runtime.a (for lib search path when using non-Flang driver for linking) +# Library path of libflang_rt.runtime.a/.so (for lib search path when using non-Flang driver for linking and LD_LIBRARY_PATH) config.substitutions.append(("%libdir", config.flang_rt_output_resource_lib_dir)) # For CUDA offloading, additional steps (device linking) and libraries (cudart) are needed. 
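The flang-rt changes above split each runtime library into optional static and shared flavors: FLANG_RT_ENABLE_STATIC/FLANG_RT_ENABLE_SHARED choose what gets built, ${name}.static/${name}.shared name the flavors, and ${name}.default is an alias that follows BUILD_SHARED_LIBS. A minimal CMake sketch of how a consumer project might pick a flavor, assuming a hypothetical out-of-tree executable target my_tool; only the option and target names are taken from the changes above, everything else is illustrative:

  # Configure-time selection (both flavors may be enabled at once):
  #   cmake ... -DFLANG_RT_ENABLE_STATIC=ON -DFLANG_RT_ENABLE_SHARED=ON
  # flang_rt.runtime.static/.shared exist only when the matching option is ON;
  # flang_rt.runtime.default resolves to one of them based on BUILD_SHARED_LIBS,
  # which is how the ExternalHelloWorld example above links.
  add_executable(my_tool main.cpp)                                 # hypothetical consumer
  target_link_libraries(my_tool PRIVATE flang_rt.runtime.default)  # follow BUILD_SHARED_LIBS
  # target_link_libraries(my_tool PRIVATE flang_rt.runtime.static) # force the static archive
  # target_link_libraries(my_tool PRIVATE flang_rt.runtime.shared) # force the shared library
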
diff --git a/flang-rt/unittests/Runtime/CUDA/CMakeLists.txt b/flang-rt/unittests/Runtime/CUDA/CMakeLists.txt index cd69a6f472873..2faacfda92a84 100644 --- a/flang-rt/unittests/Runtime/CUDA/CMakeLists.txt +++ b/flang-rt/unittests/Runtime/CUDA/CMakeLists.txt @@ -14,5 +14,5 @@ add_flangrt_unittest(FlangCufRuntimeTests target_link_libraries(FlangCufRuntimeTests PRIVATE - flang_rt.cuda + flang_rt.cuda.default ) diff --git a/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp b/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp index 3512a537d38c3..d2581e3ad0a0a 100644 --- a/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp @@ -84,9 +84,10 @@ class GenericLoopConversionPattern << loopOp->getName() << " operation"; }; - // For standalone directives, `bind` is already supported. Other combined - // forms will be supported in a follow-up PR. - if (combinedInfo != GenericLoopCombinedInfo::Standalone && + // For `loop` and `teams loop` directives, `bind` is supported. + // Additionally, for `teams loop`, semantic checking verifies that the + // `bind` clause modifier is `teams`, so no need to check this here again. + if (combinedInfo == GenericLoopCombinedInfo::ParallelLoop && loopOp.getBindKind()) return todo("bind"); diff --git a/flang/test/Lower/OpenMP/generic-loop-rewriting.f90 b/flang/test/Lower/OpenMP/generic-loop-rewriting.f90 index fa26425356dd9..0699c36c69519 100644 --- a/flang/test/Lower/OpenMP/generic-loop-rewriting.f90 +++ b/flang/test/Lower/OpenMP/generic-loop-rewriting.f90 @@ -1,5 +1,12 @@ -!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s +!RUN: split-file %s %t +!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %t/no_bind_clause.f90 -o - \ +!RUN: | FileCheck %s + +!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %t/bind_clause_teams.f90 -o - \ +!RUN: | FileCheck %s + +!--- no_bind_clause.f90 subroutine target_teams_loop implicit none integer :: x, i @@ -10,6 +17,17 @@ subroutine target_teams_loop end do end subroutine target_teams_loop +!--- bind_clause_teams.f90 +subroutine target_teams_loop + implicit none + integer :: x, i + + !$omp target teams loop bind(teams) + do i = 0, 10 + x = x + i + end do +end subroutine target_teams_loop + !CHECK-LABEL: func.func @_QPtarget_teams_loop !CHECK: omp.target map_entries( !CHECK-SAME: %{{.*}} -> %[[I_ARG:[^[:space:]]+]], diff --git a/flang/test/Transforms/generic-loop-rewriting-todo.mlir b/flang/test/Transforms/generic-loop-rewriting-todo.mlir index cbde981c4c49d..25baffe34e394 100644 --- a/flang/test/Transforms/generic-loop-rewriting-todo.mlir +++ b/flang/test/Transforms/generic-loop-rewriting-todo.mlir @@ -16,22 +16,6 @@ func.func @_QPparallel_loop() { return } -func.func @_QPloop_bind() { - omp.teams { - %c0 = arith.constant 0 : i32 - %c10 = arith.constant 10 : i32 - %c1 = arith.constant 1 : i32 - // expected-error@below {{not yet implemented: Unhandled clause bind in omp.loop operation}} - omp.loop bind(thread) { - omp.loop_nest (%arg3) : i32 = (%c0) to (%c10) inclusive step (%c1) { - omp.yield - } - } - omp.terminator - } - return -} - omp.declare_reduction @add_reduction_i32 : i32 init { ^bb0(%arg0: i32): %c0_i32 = arith.constant 0 : i32 diff --git a/libc/docs/gpu/building.rst b/libc/docs/gpu/building.rst index 94d3f1f644e5c..9f9528b30d9bf 100644 --- a/libc/docs/gpu/building.rst +++ b/libc/docs/gpu/building.rst @@ -43,7 +43,7 @@ arguments automatically. 
$> cd build $> cmake ../llvm -G Ninja \ -DLLVM_ENABLE_PROJECTS="clang;lld" \ - -DLLVM_ENABLE_RUNTIMES="openmp" \ + -DLLVM_ENABLE_RUNTIMES="openmp;offload" \ -DCMAKE_BUILD_TYPE= \ # Select build type -DCMAKE_INSTALL_PREFIX= \ # Where the libraries will live -DRUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES=libc \ diff --git a/libc/include/fenv.h.def b/libc/include/fenv.h.def deleted file mode 100644 index c677b2a5930dc..0000000000000 --- a/libc/include/fenv.h.def +++ /dev/null @@ -1,17 +0,0 @@ -//===-- C standard library header fenv.h ----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_FENV_H -#define LLVM_LIBC_FENV_H - -#include "__llvm-libc-common.h" -#include "llvm-libc-macros/fenv-macros.h" - -%%public_api() - -#endif // LLVM_LIBC_FENV_H diff --git a/libc/include/fenv.yaml b/libc/include/fenv.yaml index 1ecaf63085504..c7cc7e87df37f 100644 --- a/libc/include/fenv.yaml +++ b/libc/include/fenv.yaml @@ -1,11 +1,32 @@ header: fenv.h -header_template: fenv.h.def -macros: [] +standards: + - stdc +macros: + - macro_name: FE_ALL_EXCEPT + macro_header: fenv-macros.h + - macro_name: FE_DIVBYZERO + macro_header: fenv-macros.h + - macro_name: FE_INEXACT + macro_header: fenv-macros.h + - macro_name: FE_INVALID + macro_header: fenv-macros.h + - macro_name: FE_OVERFLOW + macro_header: fenv-macros.h + - macro_name: FE_UNDERFLOW + macro_header: fenv-macros.h + - macro_name: FE_DOWNWARD + macro_header: fenv-macros.h + - macro_name: FE_TONEAREST + macro_header: fenv-macros.h + - macro_name: FE_TOWARDZERO + macro_header: fenv-macros.h + - macro_name: FE_UPWARD + macro_header: fenv-macros.h + - macro_name: FE_DFL_ENV + macro_header: fenv-macros.h types: - type_name: fenv_t - type_name: fexcept_t -enums: [] -objects: [] functions: - name: feclearexcept standards: @@ -15,14 +36,14 @@ functions: - type: int - name: fedisableexcept standards: - - GNUExtensions + - gnu return_type: int arguments: - type: int guard: null - name: feenableexcept standards: - - GNUExtensions + - gnu return_type: int arguments: - type: int @@ -35,7 +56,7 @@ functions: - type: fenv_t * - name: fegetexcept standards: - - GNUExtensions + - gnu return_type: int arguments: [] - name: fegetexceptflag diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h index 323c003f1ff07..0fd3a6498b865 100644 --- a/libc/src/__support/GPU/utils.h +++ b/libc/src/__support/GPU/utils.h @@ -92,6 +92,14 @@ LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x, return __gpu_shuffle_idx_u32(lane_mask, idx, x, width); } +LIBC_INLINE uint64_t match_any(uint64_t lane_mask, uint32_t x) { + return __gpu_match_any_u32(lane_mask, x); +} + +LIBC_INLINE uint64_t match_all(uint64_t lane_mask, uint32_t x) { + return __gpu_match_all_u32(lane_mask, x); +} + [[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); } LIBC_INLINE bool is_first_lane(uint64_t lane_mask) { diff --git a/libc/test/integration/src/__support/GPU/CMakeLists.txt b/libc/test/integration/src/__support/GPU/CMakeLists.txt index 68bbc3849bc7e..e066830f6cc0d 100644 --- a/libc/test/integration/src/__support/GPU/CMakeLists.txt +++ b/libc/test/integration/src/__support/GPU/CMakeLists.txt @@ -18,3 +18,12 @@ add_integration_test( LOADER_ARGS --threads 64 ) + 
+add_integration_test( + match_test + SUITE libc-support-gpu-tests + SRCS + match.cpp + LOADER_ARGS + --threads 64 +) diff --git a/libc/test/integration/src/__support/GPU/match.cpp b/libc/test/integration/src/__support/GPU/match.cpp new file mode 100644 index 0000000000000..0eadb1364eec7 --- /dev/null +++ b/libc/test/integration/src/__support/GPU/match.cpp @@ -0,0 +1,35 @@ +//===-- Test for the match operations on the GPU --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/bit.h" +#include "src/__support/GPU/utils.h" +#include "test/IntegrationTest/test.h" + +using namespace LIBC_NAMESPACE; + +// Test to ensure that match any / match all work. +static void test_match() { + uint64_t mask = gpu::get_lane_mask(); + EXPECT_EQ(1ull << gpu::get_lane_id(), + gpu::match_any(mask, gpu::get_lane_id())); + EXPECT_EQ(mask, gpu::match_any(mask, 1)); + + uint64_t expected = gpu::get_lane_id() < 16 ? 0xffff : 0xffff0000; + EXPECT_EQ(expected, gpu::match_any(mask, gpu::get_lane_id() < 16)); + EXPECT_EQ(mask, gpu::match_all(mask, 1)); + EXPECT_EQ(0ull, gpu::match_all(mask, gpu::get_lane_id())); +} + +TEST_MAIN(int argc, char **argv, char **envp) { + if (gpu::get_thread_id() >= gpu::get_lane_size()) + return 0; + + test_match(); + + return 0; +} diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index c88ea9700d100..5cefa8a264310 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -243,30 +243,30 @@ add_custom_command( OUTPUT convert.cl COMMAND ${Python3_EXECUTABLE} ${script_loc} > convert.cl DEPENDS ${script_loc} ) -add_custom_target( "generate_convert.cl" DEPENDS convert.cl ) -set_target_properties( "generate_convert.cl" PROPERTIES FOLDER "libclc/Sourcegenning" ) +add_custom_target( generate-convert.cl DEPENDS convert.cl ) +set_target_properties( generate-convert.cl PROPERTIES FOLDER "libclc/Sourcegenning" ) add_custom_command( OUTPUT clc-convert.cl COMMAND ${Python3_EXECUTABLE} ${script_loc} --clc > clc-convert.cl DEPENDS ${script_loc} ) -add_custom_target( "clc-generate_convert.cl" DEPENDS clc-convert.cl ) -set_target_properties( "clc-generate_convert.cl" PROPERTIES FOLDER "libclc/Sourcegenning" ) +add_custom_target( generate-clc-convert.cl DEPENDS clc-convert.cl ) +set_target_properties( generate-clc-convert.cl PROPERTIES FOLDER "libclc/Sourcegenning" ) if ( clspv-- IN_LIST LIBCLC_TARGETS_TO_BUILD OR clspv64-- IN_LIST LIBCLC_TARGETS_TO_BUILD ) add_custom_command( OUTPUT clspv-convert.cl COMMAND ${Python3_EXECUTABLE} ${script_loc} --clspv > clspv-convert.cl DEPENDS ${script_loc} ) - add_custom_target( "clspv-generate_convert.cl" DEPENDS clspv-convert.cl ) - set_target_properties( "clspv-generate_convert.cl" PROPERTIES FOLDER "libclc/Sourcegenning" ) + add_custom_target( generate-clspv-convert.cl DEPENDS clspv-convert.cl ) + set_target_properties( generate-clspv-convert.cl PROPERTIES FOLDER "libclc/Sourcegenning" ) add_custom_command( OUTPUT clc-clspv-convert.cl COMMAND ${Python3_EXECUTABLE} ${script_loc} --clc --clspv > clc-clspv-convert.cl DEPENDS ${script_loc} ) - add_custom_target( "clc-clspv-generate_convert.cl" DEPENDS clc-clspv-convert.cl ) - set_target_properties( "clc-clspv-generate_convert.cl" PROPERTIES FOLDER "libclc/Sourcegenning" ) + add_custom_target( 
generate-clc-clspv-convert.cl DEPENDS clc-clspv-convert.cl ) + set_target_properties( generate-clc-clspv-convert.cl PROPERTIES FOLDER "libclc/Sourcegenning" ) endif() enable_testing() @@ -324,9 +324,11 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) if( NOT ARCH STREQUAL spirv AND NOT ARCH STREQUAL spirv64 ) if( ARCH STREQUAL clspv OR ARCH STREQUAL clspv64 ) list( APPEND opencl_gen_files clspv-convert.cl ) - elseif ( NOT ENABLE_RUNTIME_SUBNORMAL ) + else() list( APPEND opencl_gen_files convert.cl ) - list( APPEND opencl_lib_files generic/lib/subnormal_use_default.ll ) + if ( NOT ENABLE_RUNTIME_SUBNORMAL ) + list( APPEND opencl_lib_files generic/lib/subnormal_use_default.ll ) + endif() endif() endif() diff --git a/libclc/amdgcn/lib/cl_khr_int64_extended_atomics/minmax_helpers.ll b/libclc/amdgcn/lib/cl_khr_int64_extended_atomics/minmax_helpers.ll index 98f1f54718a1f..7f12556c0abbc 100644 --- a/libclc/amdgcn/lib/cl_khr_int64_extended_atomics/minmax_helpers.ll +++ b/libclc/amdgcn/lib/cl_khr_int64_extended_atomics/minmax_helpers.ll @@ -1,9 +1,3 @@ -#if __clang_major__ >= 7 -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" -#else -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" -#endif - define i64 @__clc__sync_fetch_and_min_global_8(i64 addrspace(1)* nocapture %ptr, i64 %value) nounwind alwaysinline { entry: %0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %value seq_cst diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake index a3b311f12a1e3..40e31e0ba4f45 100644 --- a/libclc/cmake/modules/AddLibclc.cmake +++ b/libclc/cmake/modules/AddLibclc.cmake @@ -225,16 +225,23 @@ function(add_libclc_builtin_set) message( FATAL_ERROR "Must provide ARCH, ARCH_SUFFIX, and TRIPLE" ) endif() - set( bytecode_files "" ) + set( bytecode_files ) + set( bytecode_ir_files ) foreach( file IN LISTS ARG_GEN_FILES ARG_LIB_FILES ) # We need to take each file and produce an absolute input file, as well # as a unique architecture-specific output file. We deal with a mix of # different input files, which makes this trickier. + set( input_file_dep ) if( ${file} IN_LIST ARG_GEN_FILES ) # Generated files are given just as file names, which we must make # absolute to the binary directory. set( input_file ${CMAKE_CURRENT_BINARY_DIR}/${file} ) set( output_file "${LIBCLC_ARCH_OBJFILE_DIR}/${file}.bc" ) + # If a target exists that generates this file, add that as a dependency + # of the custom command. + if( TARGET generate-${file} ) + set( input_file_dep generate-${file} ) + endif() else() # Other files are originally relative to each SOURCE file, which are # then make relative to the libclc root directory. 
We must normalize @@ -249,23 +256,31 @@ function(add_libclc_builtin_set) get_filename_component( file_dir ${file} DIRECTORY ) - if( ARG_ARCH STREQUAL spirv OR ARG_ARCH STREQUAL spirv64 ) - set(CONVERT_DEP clspv-generate_convert.cl) - else() - set(CONVERT_DEP generate_convert.cl) - endif() - compile_to_bc( TRIPLE ${ARG_TRIPLE} INPUT ${input_file} OUTPUT ${output_file} EXTRA_OPTS -fno-builtin -nostdlib "${ARG_COMPILE_FLAGS}" -I${CMAKE_CURRENT_SOURCE_DIR}/${file_dir} - DEPENDENCIES ${CONVERT_DEP} + DEPENDENCIES ${input_file_dep} ) - list( APPEND bytecode_files ${output_file} ) + + # Collect all files originating in LLVM IR separately + get_filename_component( file_ext ${file} EXT ) + if( ${file_ext} STREQUAL ".ll" ) + list( APPEND bytecode_ir_files ${output_file} ) + else() + list( APPEND bytecode_files ${output_file} ) + endif() endforeach() + # Prepend all LLVM IR files to the list so they are linked into the final + # bytecode modules first. This helps to suppress unnecessary warnings + # regarding different data layouts while linking. Any LLVM IR files without a + # data layout will (silently) be given the first data layout the linking + # process comes across. + list( PREPEND bytecode_files ${bytecode_ir_files} ) + set( builtins_comp_lib_tgt builtins.comp.${ARG_ARCH_SUFFIX} ) add_custom_target( ${builtins_comp_lib_tgt} DEPENDS ${bytecode_files} @@ -351,8 +366,9 @@ function(add_libclc_builtin_set) add_custom_target( prepare-${obj_suffix} ALL DEPENDS ${obj_suffix} ) set_target_properties( "prepare-${obj_suffix}" PROPERTIES FOLDER "libclc/Device IR/Prepare" ) - # nvptx-- targets don't include workitem builtins - if( NOT ARG_TRIPLE MATCHES ".*ptx.*--$" ) + # nvptx-- targets don't include workitem builtins, and clspv targets don't + # include all OpenCL builtins + if( NOT ARG_ARCH MATCHES "^(nvptx|clspv)(64)?$" ) add_test( NAME external-calls-${obj_suffix} COMMAND ./check_external_calls.sh ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} ${LLVM_TOOLS_BINARY_DIR} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} ) diff --git a/libclc/r600/lib/image/get_image_attributes_impl.ll b/libclc/r600/lib/image/get_image_attributes_impl.ll index f867ab6603591..7f1965de7602c 100644 --- a/libclc/r600/lib/image/get_image_attributes_impl.ll +++ b/libclc/r600/lib/image/get_image_attributes_impl.ll @@ -1,5 +1,3 @@ -target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" - %opencl.image2d_t = type opaque %opencl.image3d_t = type opaque diff --git a/libclc/r600/lib/image/read_image_impl.ll b/libclc/r600/lib/image/read_image_impl.ll index ca2e465b4b5b8..229a2526c3743 100644 --- a/libclc/r600/lib/image/read_image_impl.ll +++ b/libclc/r600/lib/image/read_image_impl.ll @@ -1,5 +1,3 @@ -target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" - %opencl.image2d_t = type opaque declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, diff --git a/libclc/r600/lib/image/write_image_impl.ll b/libclc/r600/lib/image/write_image_impl.ll index 03595ba1db737..265f5d6045e42 100644 --- a/libclc/r600/lib/image/write_image_impl.ll +++ b/libclc/r600/lib/image/write_image_impl.ll @@ -1,5 +1,3 @@ -target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" - %opencl.image2d_t = type opaque %opencl.image3d_t = type opaque diff --git a/libcxx/docs/DesignDocs/ExperimentalFeatures.rst 
b/libcxx/docs/DesignDocs/ExperimentalFeatures.rst index dc2ae6a25aa5d..0dbbd5f869e36 100644 --- a/libcxx/docs/DesignDocs/ExperimentalFeatures.rst +++ b/libcxx/docs/DesignDocs/ExperimentalFeatures.rst @@ -160,8 +160,8 @@ has been removed in LLVM 17.0. `Networking TS `__ ------------------------------------------- -The Networking TS is not yet part of a shipping standard. -We have not yet shipped an implementation of the Networking TS. +The Networking TS is not yet part of a shipping standard, and there is discussion around removing it. +Libc++ never shipped an implementation of the Networking TS and does not plan to do so in the future. `Ranges TS `__ --------------------------------------- diff --git a/libcxx/docs/Status/Cxx17Issues.csv b/libcxx/docs/Status/Cxx17Issues.csv index e6a232980cf7c..477f3d363a4e2 100644 --- a/libcxx/docs/Status/Cxx17Issues.csv +++ b/libcxx/docs/Status/Cxx17Issues.csv @@ -158,14 +158,14 @@ "`LWG2683 `__","filesystem::copy() says ""no effects""","2016-06 (Oulu)","|Complete|","","" "`LWG2684 `__","priority_queue lacking comparator typedef","2016-06 (Oulu)","|Complete|","","" "`LWG2685 `__","shared_ptr deleters must not throw on move construction","2016-06 (Oulu)","|Complete|","","" -"`LWG2687 `__","{inclusive,exclusive}_scan misspecified","2016-06 (Oulu)","","","" +"`LWG2687 `__","LWG2687: {inclusive,exclusive}_scan misspecified","2016-06 (Oulu)","|Complete|","","" "`LWG2688 `__","clamp misses preconditions and has extraneous condition on result","2016-06 (Oulu)","|Complete|","","" "`LWG2689 `__","Parallel versions of std::copy and std::move shouldn't be in order","2016-06 (Oulu)","|Nothing To Do|","","" "`LWG2698 `__","Effect of assign() on iterators/pointers/references","2016-06 (Oulu)","|Complete|","","" "`LWG2704 `__","recursive_directory_iterator's members should require '``*this`` is dereferenceable'","2016-06 (Oulu)","|Complete|","","" "`LWG2706 `__","Error reporting for recursive_directory_iterator::pop() is under-specified","2016-06 (Oulu)","|Complete|","","" "`LWG2707 `__","path construction and assignment should have ""string_type&&"" overloads","2016-06 (Oulu)","|Complete|","","" -"`LWG2709 `__","offsetof is unnecessarily imprecise","2016-06 (Oulu)","","","" +"`LWG2709 `__","LWG2709: offsetof is unnecessarily imprecise","2016-06 (Oulu)","|Nothing To Do|","","" "`LWG2710 `__","""Effects: Equivalent to ..."" doesn't count ""Synchronization:"" as determined semantics","2016-06 (Oulu)","|Complete|","","" "`LWG2711 `__","path is convertible from approximately everything under the sun","2016-06 (Oulu)","|Complete|","","" "`LWG2716 `__","Specification of shuffle and sample disallows lvalue URNGs","2016-06 (Oulu)","|Complete|","","" diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index ca286146840b1..1b8e76d90d9ef 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -13,7 +13,7 @@ "`LWG2966 `__","Incomplete resolution of US 74","2017-07 (Toronto)","|Nothing To Do|","","" "`LWG2974 `__","Diagnose out of bounds ``tuple_element/variant_alternative``\ ","2017-07 (Toronto)","|Complete|","","" "","","","","","" -"`LWG2779 `__","[networking.ts] Relax requirements on buffer sequence iterators","2017-11 (Albuquerque)","","","" +"`LWG2779 `__","[networking.ts] Relax requirements on buffer sequence iterators","2017-11 (Albuquerque)","|Nothing To Do|","","" "`LWG2870 `__","Default value of parameter theta of polar should be dependent","2017-11 (Albuquerque)","|Complete|","","" "`LWG2935 `__","What 
should create_directories do when p already exists but is not a directory?","2017-11 (Albuquerque)","|Nothing To Do|","","" "`LWG2941 `__","[thread.req.timing] wording should apply to both member and namespace-level functions","2017-11 (Albuquerque)","|Nothing To Do|","","" @@ -51,17 +51,17 @@ "`LWG2975 `__","Missing case for ``pair``\ construction in scoped and polymorphic allocators","2018-03 (Jacksonville)","","","" "`LWG2989 `__","``path``\ 's stream insertion operator lets you insert everything under the sun","2018-03 (Jacksonville)","|Complete|","","" "`LWG3000 `__","``monotonic_memory_resource::do_is_equal``\ uses ``dynamic_cast``\ unnecessarily","2018-03 (Jacksonville)","|Complete|","16","" -"`LWG3002 `__","[networking.ts] ``basic_socket_acceptor::is_open()``\ isn't ``noexcept``\ ","2018-03 (Jacksonville)","","","" +"`LWG3002 `__","[networking.ts] ``basic_socket_acceptor::is_open()``\ isn't ``noexcept``\ ","2018-03 (Jacksonville)","|Nothing To Do|","","" "`LWG3004 `__","|sect|\ [string.capacity] and |sect|\ [vector.capacity] should specify time complexity for ``capacity()``\ ","2018-03 (Jacksonville)","|Nothing To Do|","","" "`LWG3005 `__","Destruction order of arrays by ``make_shared/allocate_shared``\ only recommended?","2018-03 (Jacksonville)","","","" "`LWG3007 `__","``allocate_shared``\ should rebind allocator to *cv*-unqualified ``value_type``\ for construction","2018-03 (Jacksonville)","","","" "`LWG3009 `__","Including ````\ doesn't provide ``std::size/empty/data``\ ","2018-03 (Jacksonville)","|Complete|","","" -"`LWG3010 `__","[networking.ts] ``uses_executor``\ says ""if a type ``T::executor_type``\ exists""","2018-03 (Jacksonville)","","","" +"`LWG3010 `__","[networking.ts] ``uses_executor``\ says ""if a type ``T::executor_type``\ exists""","2018-03 (Jacksonville)","|Nothing To Do|","","" "`LWG3013 `__","``(recursive_)directory_iterator``\ construction and traversal should not be ``noexcept``\ ","2018-03 (Jacksonville)","|Complete|","","" "`LWG3014 `__","More ``noexcept``\ issues with filesystem operations","2018-03 (Jacksonville)","|Complete|","","" "`LWG3015 `__","``copy_options::*unspecified*``\ underspecified","2018-03 (Jacksonville)","|Nothing To Do|","","" "`LWG3017 `__","``list splice``\ functions should use ``addressof``\ ","2018-03 (Jacksonville)","|Complete|","","" -"`LWG3020 `__","[networking.ts] Remove spurious nested ``value_type``\ buffer sequence requirement","2018-03 (Jacksonville)","","","" +"`LWG3020 `__","[networking.ts] Remove spurious nested ``value_type``\ buffer sequence requirement","2018-03 (Jacksonville)","|Nothing To Do|","","" "`LWG3026 `__","``filesystem::weakly_canonical``\ still defined in terms of ``canonical(p, base)``\ ","2018-03 (Jacksonville)","|Complete|","","" "`LWG3030 `__","Who shall meet the requirements of ``try_lock``\ ?","2018-03 (Jacksonville)","|Nothing To Do|","","" "`LWG3034 `__","P0767R1 breaks previously-standard-layout types","2018-03 (Jacksonville)","|Complete|","","" @@ -238,7 +238,7 @@ "`LWG3313 `__","``join_view::iterator::operator--``\ is incorrectly constrained","2020-02 (Prague)","|Complete|","14","" "`LWG3314 `__","Is stream insertion behavior locale dependent when ``Period::type``\ is ``micro``\ ?","2020-02 (Prague)","|Complete|","16","" "`LWG3315 `__","LWG3315: Correct Allocator Default Behavior","2020-02 (Prague)","|Complete|","","" -"`LWG3316 `__","Correctly define epoch for ``utc_clock``\ / ``utc_timepoint``\ ","2020-02 (Prague)","|Nothing To Do|","","" +"`LWG3316 `__","Correctly define epoch for 
``utc_clock``\ / ``utc_timepoint``\ ","2020-02 (Prague)","","","" "`LWG3317 `__","Incorrect ``operator<<``\ for floating-point durations","2020-02 (Prague)","|Complete|","16","" "`LWG3318 `__","Clarify whether clocks can represent time before their epoch","2020-02 (Prague)","","","" "`LWG3319 `__","Properly reference specification of IANA time zone database","2020-02 (Prague)","|Nothing To Do|","","" diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv index 524c6d0ac8be0..360b5520260ce 100644 --- a/libcxx/docs/Status/Cxx20Papers.csv +++ b/libcxx/docs/Status/Cxx20Papers.csv @@ -34,7 +34,7 @@ "`P0528R3 `__","The Curious Case of Padding Bits, Featuring Atomic Compare-and-Exchange","2018-06 (Rapperswil)","","","" "`P0542R5 `__","Support for contract based programming in C++","2018-06 (Rapperswil)","|Nothing To Do|","n/a","Pulled at the 2019-07 meeting in Cologne" "`P0556R3 `__","Integral power-of-2 operations","2018-06 (Rapperswil)","|Complete|","9","" -"`P0619R4 `__","Reviewing Deprecated Facilities of C++17 for C++20","2018-06 (Rapperswil)","|Complete|","20","Removed headers are still provided as an extension, but with deprecation warnings" +"`P0619R4 `__","Reviewing Deprecated Facilities of C++17 for C++20","2018-06 (Rapperswil)","|Complete|","20","Removed headers are still provided as an extension, but with deprecation warnings." "`P0646R1 `__","Improving the Return Value of Erase-Like Algorithms","2018-06 (Rapperswil)","|Complete|","10","" "`P0722R3 `__","Efficient sized delete for variable sized classes","2018-06 (Rapperswil)","|Complete|","9","" "`P0758R1 `__","Implicit conversion traits and utility functions","2018-06 (Rapperswil)","|Complete|","","" @@ -43,7 +43,7 @@ "`P0788R3 `__","Standard Library Specification in a Concepts and Contracts World","2018-06 (Rapperswil)","|Nothing To Do|","n/a","Pulled at the 2019-07 meeting in Cologne" "`P0879R0 `__","Constexpr for swap and swap related functions Also resolves LWG issue 2800.","2018-06 (Rapperswil)","|Complete|","13","" "`P0887R1 `__","The identity metafunction","2018-06 (Rapperswil)","|Complete|","8","" -"`P0892R2 `__","explicit(bool)","2018-06 (Rapperswil)","","","" +"`P0892R2 `__","P0892R2: explicit(bool)","2018-06 (Rapperswil)","|Nothing To Do|","","" "`P0898R3 `__","Standard Library Concepts","2018-06 (Rapperswil)","|Complete|","13","" "`P0935R0 `__","Eradicating unnecessarily explicit default constructors from the standard library","2018-06 (Rapperswil)","|Complete|","12","" "`P0941R2 `__","Integrating feature-test macros into the C++ WD","2018-06 (Rapperswil)","|In Progress|","","" @@ -174,7 +174,7 @@ "`P1868R2 `__","width: clarifying units of width and precision in std::format","2020-02 (Prague)","|Complete|","14","" "`P1956R1 `__","On the names of low-level bit manipulation functions","2020-02 (Prague)","|Complete|","12","" "`P1957R2 `__","Converting from ``T*``\ to bool should be considered narrowing (re: US 212)","2020-02 (Prague)","|Complete|","18","" -"`P1963R0 `__","Fixing US 313","2020-02 (Prague)","","","" +"`P1963R0 `__","P1963R0: Fixing US 313","2020-02 (Prague)","|Nothing To Do|","","" "`P1964R2 `__","Wording for boolean-testable","2020-02 (Prague)","|Complete|","13","" "`P1970R2 `__","Consistency for size() functions: Add ranges::ssize","2020-02 (Prague)","|Complete|","15","" "`P1973R1 `__","Rename ""_default_init"" Functions, Rev1","2020-02 (Prague)","|Complete|","16","The feature-test macro was not set until LLVM 20." 
@@ -184,7 +184,7 @@ "`P1983R0 `__","Wording for GB301, US296, US292, US291, and US283","2020-02 (Prague)","|Complete|","15","" "`P1994R1 `__","elements_view needs its own sentinel","2020-02 (Prague)","|Complete|","16","" "`P2002R1 `__","Defaulted comparison specification cleanups","2020-02 (Prague)","|Complete|","7","" -"`P2045R1 `__","Missing Mandates for the standard library","2020-02 (Prague)","","","" +"`P2045R1 `__","P2045R1: Missing Mandates for the standard library","2020-02 (Prague)","|Nothing To Do|","","" "`P2085R0 `__","Consistent defaulted comparisons","2020-02 (Prague)","","","" "`P2091R0 `__","Issues with range access CPOs","2020-02 (Prague)","|Complete|","15","" "`P2101R0 `__","P2101R0: 'Models' subsumes 'satisfies' (Wording for US298 and US300)","2020-02 (Prague)","|Nothing To Do|","","" diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv index 264c5417a5c28..bfaa63a7c224e 100644 --- a/libcxx/docs/Status/Cxx23Papers.csv +++ b/libcxx/docs/Status/Cxx23Papers.csv @@ -100,7 +100,7 @@ "`P2396R1 `__","Concurrency TS 2 fixes ","2022-11 (Kona)","","","" "`P2505R5 `__","Monadic Functions for ``std::expected``","2022-11 (Kona)","|Complete|","17","" "`P2539R4 `__","Should the output of ``std::print`` to a terminal be synchronized with the underlying stream?","2022-11 (Kona)","|Complete|","18","" -"`P2602R2 `__","Poison Pills are Too Toxic","2022-11 (Kona)","|Complete|","19","Implemented as a DR in C++20" +"`P2602R2 `__","Poison Pills are Too Toxic","2022-11 (Kona)","|Complete|","19","Implemented as a DR in C++20." "`P2708R1 `__","No Further Fundamentals TSes","2022-11 (Kona)","|Nothing To Do|","","" "","","","","","" "`P0290R4 `__","``apply()`` for ``synchronized_value``","2023-02 (Issaquah)","","","" diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index 65fd335a0309f..b2bb1d6e9d6c3 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -2,7 +2,7 @@ "`P2497R0 `__","Testing for success or failure of ```` functions","2023-06 (Varna)","|Complete|","18","" "`P2592R3 `__","Hashing support for ``std::chrono`` value classes","2023-06 (Varna)","","","" "`P2587R3 `__","``to_string`` or not ``to_string``","2023-06 (Varna)","","","" -"`P2562R1 `__","``constexpr`` Stable Sorting","2023-06 (Varna)","|Partial|","20.0","" +"`P2562R1 `__","``constexpr`` Stable Sorting","2023-06 (Varna)","|Partial|","20","" "`P2545R4 `__","Read-Copy Update (RCU)","2023-06 (Varna)","","","" "`P2530R3 `__","Hazard Pointers for C++26","2023-06 (Varna)","","","" "`P2538R1 `__","ADL-proof ``std::projected``","2023-06 (Varna)","|Complete|","18","" @@ -15,7 +15,7 @@ "`P1901R2 `__","Enabling the Use of ``weak_ptr`` as Keys in Unordered Associative Containers","2023-06 (Varna)","","","" "`P1885R12 `__","Naming Text Encodings to Demystify Them","2023-06 (Varna)","","","" "`P0792R14 `__","``function_ref``: a type-erased callable reference","2023-06 (Varna)","","","" -"`P2874R2 `__","Mandating Annex D Require No More","2023-06 (Varna)","","","" +"`P2874R2 `__","P2874R2: Mandating Annex D Require No More","2023-06 (Varna)","|Complete|","12","" "`P2757R3 `__","Type-checking format args","2023-06 (Varna)","","","" "`P2637R3 `__","Member ``visit``","2023-06 (Varna)","|Complete|","19","Change of ``__cpp_lib_variant`` is completed in LLVM 20. Change of ``__cpp_lib_format`` is blocked by `P2419R2 `__." 
"`P2641R4 `__","Checking if a ``union`` alternative is active","2023-06 (Varna)","","","" @@ -24,7 +24,7 @@ "`P1383R2 `__","More ``constexpr`` for ```` and ````","2023-06 (Varna)","","","" "`P2734R0 `__","Adding the new SI prefixes","2023-06 (Varna)","|Complete|","17","" "`P2548R6 `__","``copyable_function``","2023-06 (Varna)","","","" -"`P2714R1 `__","Bind front and back to NTTP callables","2023-06 (Varna)","|Partial|","20","``not_fn`` only" +"`P2714R1 `__","Bind front and back to NTTP callables","2023-06 (Varna)","|Partial|","20","" "`P2630R4 `__","``submdspan``","2023-06 (Varna)","","","" "","","","","","" "`P0543R3 `__","Saturation arithmetic","2023-11 (Kona)","|Complete|","18","" diff --git a/libcxx/include/__chrono/time_zone.h b/libcxx/include/__chrono/time_zone.h index ab5c22eceaaf1..d18d59d2736bf 100644 --- a/libcxx/include/__chrono/time_zone.h +++ b/libcxx/include/__chrono/time_zone.h @@ -103,10 +103,14 @@ class _LIBCPP_AVAILABILITY_TZDB time_zone { to_sys(const local_time<_Duration>& __time, choose __z) const { local_info __info = get_info(__time); switch (__info.result) { - case local_info::unique: - case local_info::nonexistent: // first and second are the same + case local_info::unique: // first and second are the same return sys_time>{__time.time_since_epoch() - __info.first.offset}; + case local_info::nonexistent: + // first and second are the same + // All non-existing values are converted to the same time. + return sys_time>{__info.first.end}; + case local_info::ambiguous: switch (__z) { case choose::earliest: diff --git a/libcxx/include/__config b/libcxx/include/__config index c16552228dbb1..53900e40655ef 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -663,7 +663,10 @@ typedef __char32_t char32_t; # if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && \ __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101500) || \ (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && \ - __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 130000) + __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 130000) || \ + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && \ + __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 60000) || \ + (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 130000) # define _LIBCPP_HAS_C11_ALIGNED_ALLOC 0 # else # define _LIBCPP_HAS_C11_ALIGNED_ALLOC 1 diff --git a/libcxx/include/__mbstate_t.h b/libcxx/include/__mbstate_t.h index e013384454b41..c23ea7113ca70 100644 --- a/libcxx/include/__mbstate_t.h +++ b/libcxx/include/__mbstate_t.h @@ -43,12 +43,12 @@ # include // works on most Unixes #elif __has_include() # include // works on Darwin -#elif _LIBCPP_HAS_WIDE_CHARACTERS && __has_include_next() -# include_next // fall back to the C standard provider of mbstate_t +#elif __has_include_next() +# include_next // use the C standard provider of mbstate_t if present #elif __has_include_next() -# include_next // is also required to make mbstate_t visible +# include_next // Try in absence of for mbstate_t #else -# error "We don't know how to get the definition of mbstate_t without on your platform." +# error "We don't know how to get the definition of mbstate_t on your platform." 
#endif #endif // _LIBCPP___MBSTATE_T_H diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys_choose.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys_choose.pass.cpp index bad4ef352e9b9..1147c9fadf9ae 100644 --- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys_choose.pass.cpp +++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/to_sys_choose.pass.cpp @@ -88,7 +88,7 @@ static void test_nonexistent() { // Pick an historic date where it's well known what the time zone rules were. // This makes it unlikely updates to the database change these rules. std::chrono::local_time time{ - (std::chrono::sys_days{std::chrono::March / 30 / 1986} + 2h + 30min).time_since_epoch()}; + (std::chrono::sys_days{std::chrono::March / 30 / 1986} + 2h).time_since_epoch()}; std::chrono::sys_seconds expected{time.time_since_epoch() - 1h}; @@ -100,6 +100,13 @@ static void test_nonexistent() { assert(tz->to_sys(time + 0us, std::chrono::choose::latest) == expected); assert(tz->to_sys(time + 0ms, std::chrono::choose::earliest) == expected); assert(tz->to_sys(time + 0s, std::chrono::choose::latest) == expected); + + // The entire nonexistent hour should map to the same time. + // For nonexistent times the value of std::chrono::choose has no effect. + assert(tz->to_sys(time + 1s, std::chrono::choose::earliest) == expected); + assert(tz->to_sys(time + 1min, std::chrono::choose::latest) == expected); + assert(tz->to_sys(time + 30min, std::chrono::choose::earliest) == expected); + assert(tz->to_sys(time + 59min + 59s, std::chrono::choose::latest) == expected); } // Tests ambiguous conversions. @@ -120,7 +127,7 @@ static void test_ambiguous() { // Pick an historic date where it's well known what the time zone rules were. // This makes it unlikely updates to the database change these rules. std::chrono::local_time time{ - (std::chrono::sys_days{std::chrono::September / 28 / 1986} + 2h + 30min).time_since_epoch()}; + (std::chrono::sys_days{std::chrono::September / 28 / 1986} + 2h).time_since_epoch()}; std::chrono::sys_seconds earlier{time.time_since_epoch() - 2h}; std::chrono::sys_seconds later{time.time_since_epoch() - 1h}; @@ -133,6 +140,12 @@ static void test_ambiguous() { assert(tz->to_sys(time + 0us, std::chrono::choose::latest) == later); assert(tz->to_sys(time + 0ms, std::chrono::choose::earliest) == earlier); assert(tz->to_sys(time + 0s, std::chrono::choose::latest) == later); + + // Test times in the ambiguous hour. + assert(tz->to_sys(time + 1s, std::chrono::choose::earliest) == earlier + 1s); + assert(tz->to_sys(time + 1min, std::chrono::choose::latest) == later + 1min); + assert(tz->to_sys(time + 30min, std::chrono::choose::earliest) == earlier + 30min); + assert(tz->to_sys(time + 59min + 59s, std::chrono::choose::latest) == later + 59min + 59s); } // This test does the basic validations of this function. The library function diff --git a/libunwind/src/Unwind-wasm.c b/libunwind/src/Unwind-wasm.c index b18b32c5d1784..b8b7bc2779f17 100644 --- a/libunwind/src/Unwind-wasm.c +++ b/libunwind/src/Unwind-wasm.c @@ -102,8 +102,7 @@ _LIBUNWIND_EXPORT uintptr_t _Unwind_GetIP(struct _Unwind_Context *context) { } /// Not used in Wasm. -_LIBUNWIND_EXPORT void _Unwind_SetIP(struct _Unwind_Context *context, - uintptr_t value) {} +_LIBUNWIND_EXPORT void _Unwind_SetIP(struct _Unwind_Context *, uintptr_t) {} /// Called by personality handler to get LSDA for current frame.
_LIBUNWIND_EXPORT uintptr_t @@ -115,8 +114,7 @@ _Unwind_GetLanguageSpecificData(struct _Unwind_Context *context) { } /// Not used in Wasm. -_LIBUNWIND_EXPORT uintptr_t -_Unwind_GetRegionStart(struct _Unwind_Context *context) { +_LIBUNWIND_EXPORT uintptr_t _Unwind_GetRegionStart(struct _Unwind_Context *) { return 0; } diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index a01c69c709876..3494d1ba0ac02 100644 --- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -1070,16 +1070,20 @@ void MergeChunk::writeTo(uint8_t *buf) const { } // MinGW specific. -size_t AbsolutePointerChunk::getSize() const { return ctx.config.wordsize; } +size_t AbsolutePointerChunk::getSize() const { + return symtab.ctx.config.wordsize; +} void AbsolutePointerChunk::writeTo(uint8_t *buf) const { - if (ctx.config.is64()) { + if (symtab.ctx.config.is64()) { write64le(buf, value); } else { write32le(buf, value); } } +MachineTypes AbsolutePointerChunk::getMachine() const { return symtab.machine; } + void ECExportThunkChunk::writeTo(uint8_t *buf) const { memcpy(buf, ECExportThunkCode, sizeof(ECExportThunkCode)); write32le(buf + 10, target->getRVA() - rva - 14); diff --git a/lld/COFF/Chunks.h b/lld/COFF/Chunks.h index d6216efdd90bd..06e9aae0e6f6e 100644 --- a/lld/COFF/Chunks.h +++ b/lld/COFF/Chunks.h @@ -910,16 +910,17 @@ class PseudoRelocTableChunk : public NonSectionChunk { // MinGW specific. A Chunk that contains one pointer-sized absolute value. class AbsolutePointerChunk : public NonSectionChunk { public: - AbsolutePointerChunk(COFFLinkerContext &ctx, uint64_t value) - : value(value), ctx(ctx) { + AbsolutePointerChunk(SymbolTable &symtab, uint64_t value) + : value(value), symtab(symtab) { setAlignment(getSize()); } size_t getSize() const override; void writeTo(uint8_t *buf) const override; + MachineTypes getMachine() const override; private: uint64_t value; - COFFLinkerContext &ctx; + SymbolTable &symtab; }; // Return true if this file has the hotpatch flag set to true in the S_COMPILE3 diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 678de915b6cdb..58727c1615769 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -403,6 +403,12 @@ void OutputSection::addContributingPartialSection(PartialSection *sec) { contribSections.push_back(sec); } +void OutputSection::splitECChunks() { + llvm::stable_sort(chunks, [=](const Chunk *a, const Chunk *b) { + return (a->getMachine() != ARM64) < (b->getMachine() != ARM64); + }); +} + // Check whether the target address S is in range from a relocation // of type relType at address P. bool Writer::isInRange(uint16_t relType, uint64_t s, uint64_t p, int margin, @@ -1156,6 +1162,11 @@ void Writer::createSections() { sec->addContributingPartialSection(pSec); } + if (ctx.hybridSymtab) { + if (OutputSection *sec = findSection(".CRT")) + sec->splitECChunks(); + } + // Finally, move some output sections to the end. auto sectionOrder = [&](const OutputSection *s) { // Move DISCARDABLE (or non-memory-mapped) sections to the end of file @@ -2324,21 +2335,28 @@ void Writer::createRuntimePseudoRelocs() { // There's a symbol pointing to the start sentinel pointer, __CTOR_LIST__ // and __DTOR_LIST__ respectively. 
void Writer::insertCtorDtorSymbols() { - AbsolutePointerChunk *ctorListHead = make(ctx, -1); - AbsolutePointerChunk *ctorListEnd = make(ctx, 0); - AbsolutePointerChunk *dtorListHead = make(ctx, -1); - AbsolutePointerChunk *dtorListEnd = make(ctx, 0); - ctorsSec->insertChunkAtStart(ctorListHead); - ctorsSec->addChunk(ctorListEnd); - dtorsSec->insertChunkAtStart(dtorListHead); - dtorsSec->addChunk(dtorListEnd); - - Symbol *ctorListSym = ctx.symtab.findUnderscore("__CTOR_LIST__"); - Symbol *dtorListSym = ctx.symtab.findUnderscore("__DTOR_LIST__"); - replaceSymbol(ctorListSym, ctorListSym->getName(), - ctorListHead); - replaceSymbol(dtorListSym, dtorListSym->getName(), - dtorListHead); + ctx.forEachSymtab([&](SymbolTable &symtab) { + AbsolutePointerChunk *ctorListHead = make(symtab, -1); + AbsolutePointerChunk *ctorListEnd = make(symtab, 0); + AbsolutePointerChunk *dtorListHead = make(symtab, -1); + AbsolutePointerChunk *dtorListEnd = make(symtab, 0); + ctorsSec->insertChunkAtStart(ctorListHead); + ctorsSec->addChunk(ctorListEnd); + dtorsSec->insertChunkAtStart(dtorListHead); + dtorsSec->addChunk(dtorListEnd); + + Symbol *ctorListSym = symtab.findUnderscore("__CTOR_LIST__"); + Symbol *dtorListSym = symtab.findUnderscore("__DTOR_LIST__"); + replaceSymbol(ctorListSym, ctorListSym->getName(), + ctorListHead); + replaceSymbol(dtorListSym, dtorListSym->getName(), + dtorListHead); + }); + + if (ctx.hybridSymtab) { + ctorsSec->splitECChunks(); + dtorsSec->splitECChunks(); + } } // Handles /section options to allow users to overwrite diff --git a/lld/COFF/Writer.h b/lld/COFF/Writer.h index 9004bb310d073..7e458b766bae8 100644 --- a/lld/COFF/Writer.h +++ b/lld/COFF/Writer.h @@ -50,6 +50,9 @@ class OutputSection { void writeHeaderTo(uint8_t *buf, bool isDebug); void addContributingPartialSection(PartialSection *sec); + // Sort chunks to split native and EC sections on hybrid targets. + void splitECChunks(); + // Returns the size of this section in an executable memory image. // This may be smaller than the raw size (the raw size is multiple // of disk sector size, so there may be padding at end), or may be diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 7d2953ddf64f0..e667fdc0633c5 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -663,12 +663,12 @@ void ARM::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const { case R_ARM_THM_JUMP8: // We do a 9 bit check because val is right-shifted by 1 bit. checkInt(ctx, loc, val, 9, rel); - write16(ctx, loc, (read32(ctx, loc) & 0xff00) | ((val >> 1) & 0x00ff)); + write16(ctx, loc, (read16(ctx, loc) & 0xff00) | ((val >> 1) & 0x00ff)); break; case R_ARM_THM_JUMP11: // We do a 12 bit check because val is right-shifted by 1 bit. 
checkInt(ctx, loc, val, 12, rel); - write16(ctx, loc, (read32(ctx, loc) & 0xf800) | ((val >> 1) & 0x07ff)); + write16(ctx, loc, (read16(ctx, loc) & 0xf800) | ((val >> 1) & 0x07ff)); break; case R_ARM_THM_JUMP19: // Encoding T3: Val = S:J2:J1:imm6:imm11:0 diff --git a/lld/test/COFF/arm64x-crt-sec.s b/lld/test/COFF/arm64x-crt-sec.s new file mode 100644 index 0000000000000..5be70a1845f12 --- /dev/null +++ b/lld/test/COFF/arm64x-crt-sec.s @@ -0,0 +1,42 @@ +// REQUIRES: aarch64, x86 +// RUN: split-file %s %t.dir && cd %t.dir + +// RUN: llvm-mc -filetype=obj -triple=aarch64-windows crt1-arm64.s -o crt1-arm64.obj +// RUN: llvm-mc -filetype=obj -triple=aarch64-windows crt2-arm64.s -o crt2-arm64.obj +// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows crt1-arm64ec.s -o crt1-arm64ec.obj +// RUN: llvm-mc -filetype=obj -triple=x86_64-windows crt2-amd64.s -o crt2-amd64.obj + +// Check that .CRT chunks are correctly sorted and that EC and native chunks are split. + +// RUN: lld-link -out:out.dll -machine:arm64x -dll -noentry crt1-arm64.obj crt2-arm64.obj crt1-arm64ec.obj crt2-amd64.obj +// RUN: llvm-readobj --hex-dump=.CRT out.dll | FileCheck %s + +// RUN: lld-link -out:out2.dll -machine:arm64x -dll -noentry crt1-arm64.obj crt1-arm64ec.obj crt2-arm64.obj crt2-amd64.obj +// RUN: llvm-readobj --hex-dump=.CRT out2.dll | FileCheck %s + +// RUN: lld-link -out:out3.dll -machine:arm64x -dll -noentry crt2-amd64.obj crt1-arm64ec.obj crt2-arm64.obj crt1-arm64.obj +// RUN: llvm-readobj --hex-dump=.CRT out3.dll | FileCheck %s + +// CHECK: 0x180002000 01000000 00000000 02000000 00000000 +// CHECK-NEXT: 0x180002010 03000000 00000000 11000000 00000000 +// CHECK-NEXT: 0x180002020 12000000 00000000 13000000 00000000 + +#--- crt1-arm64.s + .section .CRT$A,"dr" + .xword 1 + .section .CRT$Z,"dr" + .xword 3 + +#--- crt2-arm64.s + .section .CRT$B,"dr" + .xword 2 + +#--- crt1-arm64ec.s + .section .CRT$A,"dr" + .xword 0x11 + .section .CRT$Z,"dr" + .xword 0x13 + +#--- crt2-amd64.s + .section .CRT$B,"dr" + .quad 0x12 diff --git a/lld/test/COFF/arm64x-ctors-sec.s b/lld/test/COFF/arm64x-ctors-sec.s new file mode 100644 index 0000000000000..283d5f045260d --- /dev/null +++ b/lld/test/COFF/arm64x-ctors-sec.s @@ -0,0 +1,76 @@ +// REQUIRES: aarch64, x86 +// RUN: split-file %s %t.dir && cd %t.dir + +// RUN: llvm-mc -filetype=obj -triple=aarch64-windows ctor1-arm64.s -o ctor1-arm64.obj +// RUN: llvm-mc -filetype=obj -triple=aarch64-windows ctor2-arm64.s -o ctor2-arm64.obj +// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows ctor1-arm64ec.s -o ctor1-arm64ec.obj +// RUN: llvm-mc -filetype=obj -triple=x86_64-windows ctor2-amd64.s -o ctor2-amd64.obj +// RUN: llvm-mc -filetype=obj -triple=aarch64-windows test.s -o test-arm64.obj +// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows test.s -o test-arm64ec.obj + +// Check that .ctors and .dtors chunks are correctly sorted and that EC and native chunks are split. 
+ +// RUN: lld-link -out:out.dll -machine:arm64x -lldmingw -dll -noentry test-arm64.obj test-arm64ec.obj \ +// RUN: ctor1-arm64.obj ctor2-arm64.obj ctor1-arm64ec.obj ctor2-amd64.obj +// RUN: llvm-readobj --hex-dump=.rdata --hex-dump=.test out.dll | FileCheck %s + +// RUN: lld-link -out:out2.dll -machine:arm64x -lldmingw -dll -noentry test-arm64.obj test-arm64ec.obj \ +// RUN: ctor1-arm64ec.obj ctor2-amd64.obj ctor1-arm64.obj ctor2-arm64.obj +// RUN: llvm-readobj --hex-dump=.rdata --hex-dump=.test out2.dll | FileCheck %s + +// RUN: lld-link -out:out3.dll -machine:arm64x -lldmingw -dll -noentry test-arm64.obj test-arm64ec.obj \ +// RUN: ctor2-arm64.obj ctor1-arm64ec.obj ctor2-amd64.obj ctor1-arm64.obj +// RUN: llvm-readobj --hex-dump=.rdata --hex-dump=.test out3.dll | FileCheck %s + +// CHECK: Hex dump of section '.rdata': +// CHECK-NEXT: 0x180001000 ffffffff ffffffff 01000000 00000000 +// CHECK-NEXT: 0x180001010 02000000 00000000 03000000 00000000 +// CHECK-NEXT: 0x180001020 00000000 00000000 ffffffff ffffffff +// CHECK-NEXT: 0x180001030 11000000 00000000 12000000 00000000 +// CHECK-NEXT: 0x180001040 13000000 00000000 00000000 00000000 +// CHECK-NEXT: 0x180001050 ffffffff ffffffff 01010000 00000000 +// CHECK-NEXT: 0x180001060 02010000 00000000 03010000 00000000 +// CHECK-NEXT: 0x180001070 00000000 00000000 ffffffff ffffffff +// CHECK-NEXT: 0x180001080 11010000 00000000 12010000 00000000 +// CHECK-NEXT: 0x180001090 13010000 00000000 00000000 00000000 +// CHECK-EMPTY: +// CHECK-NEXT: Hex dump of section '.test': +// CHECK-NEXT: 0x180003000 00100000 50100000 28100000 78100000 + +#--- ctor1-arm64.s + .section .ctors.1,"drw" + .xword 1 + .section .ctors.3,"drw" + .xword 3 + .section .dtors.1,"drw" + .xword 0x101 + .section .dtors.3,"drw" + .xword 0x103 + +#--- ctor2-arm64.s + .section .ctors.2,"drw" + .xword 2 + .section .dtors.2,"drw" + .xword 0x102 + +#--- ctor1-arm64ec.s + .section .ctors.1,"drw" + .xword 0x11 + .section .ctors.3,"drw" + .xword 0x13 + .section .dtors.1,"drw" + .xword 0x111 + .section .dtors.3,"drw" + .xword 0x113 + +#--- ctor2-amd64.s + .section .ctors.2,"drw" + .quad 0x12 + .section .dtors.2,"drw" + .quad 0x112 + +#--- test.s + .section .test + .rva __CTOR_LIST__ + .rva __DTOR_LIST__ + diff --git a/lld/test/ELF/arm-thumb-jump8-11.s b/lld/test/ELF/arm-thumb-jump8-11.s new file mode 100644 index 0000000000000..ed54f3c0cc945 --- /dev/null +++ b/lld/test/ELF/arm-thumb-jump8-11.s @@ -0,0 +1,32 @@ +# REQUIRES: arm + +# RUN: llvm-mc -triple thumbv6m-arm-eabi --filetype=obj %s -o %t.o +# RUN: ld.lld %t.o -o %t +# RUN: llvm-objdump -d %t --no-show-raw-insn | FileCheck %s --check-prefixes=CHECK,CHECK-LE + +# RUN: llvm-mc -triple thumbebv6m-arm-eabi --filetype=obj %s -o %t.o +# RUN: ld.lld %t.o -o %t +# RUN: llvm-objdump -d %t --no-show-raw-insn | FileCheck %s --check-prefixes=CHECK,CHECK-BE + +# CHECK-LE: file format elf32-littlearm +# CHECK-BE: file format elf32-bigarm + +# CHECK: Disassembly of section .text: + +# CHECK-LABEL: [[#%x,TARGET:]] : +# CHECK-NEXT: [[#TARGET]]: bx lr + +# CHECK-LABEL: <_start>: +# CHECK-NEXT: b 0x[[#TARGET]] +# CHECK-NEXT: beq 0x[[#TARGET]] + + .thumb + .section .text.1, "ax", %progbits +target: + bx lr + + .section .text.2, "ax", %progbits + .globl _start +_start: + b.n target // R_ARM_THM_JUMP11 + beq.n target // R_ARM_THM_JUMP8 diff --git a/lldb/include/lldb/DataFormatters/TypeSynthetic.h b/lldb/include/lldb/DataFormatters/TypeSynthetic.h index bf6dc6a0c3c6b..14e516964f250 100644 --- a/lldb/include/lldb/DataFormatters/TypeSynthetic.h +++ 
b/lldb/include/lldb/DataFormatters/TypeSynthetic.h @@ -68,7 +68,7 @@ class SyntheticChildrenFrontEnd { // a false return value from this call if it returns true, then // CalculateNumChildren() can return any number >= 0 (0 being valid) it // should if at all possible be more efficient than CalculateNumChildren() - virtual bool MightHaveChildren() = 0; + virtual bool MightHaveChildren() { return true; } // if this function returns a non-null ValueObject, then the returned // ValueObject will stand for this ValueObject whenever a "value" request is diff --git a/lldb/include/lldb/DataFormatters/VectorIterator.h b/lldb/include/lldb/DataFormatters/VectorIterator.h index 70bcf50ca1b1d..d095f085cabab 100644 --- a/lldb/include/lldb/DataFormatters/VectorIterator.h +++ b/lldb/include/lldb/DataFormatters/VectorIterator.h @@ -30,8 +30,6 @@ class VectorIteratorSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: diff --git a/lldb/include/lldb/Symbol/CompilerType.h b/lldb/include/lldb/Symbol/CompilerType.h index 096a8f1ab68e8..fe4fcbccee370 100644 --- a/lldb/include/lldb/Symbol/CompilerType.h +++ b/lldb/include/lldb/Symbol/CompilerType.h @@ -14,6 +14,7 @@ #include #include +#include "lldb/Utility/Scalar.h" #include "lldb/lldb-private.h" #include "llvm/ADT/APSInt.h" #include "llvm/Support/Casting.h" @@ -544,7 +545,7 @@ bool operator==(const CompilerType &lhs, const CompilerType &rhs); bool operator!=(const CompilerType &lhs, const CompilerType &rhs); struct CompilerType::IntegralTemplateArgument { - llvm::APSInt value; + Scalar value; CompilerType type; }; diff --git a/lldb/include/lldb/Target/LanguageRuntime.h b/lldb/include/lldb/Target/LanguageRuntime.h index f9ae2dc589632..7e4c11df0da7f 100644 --- a/lldb/include/lldb/Target/LanguageRuntime.h +++ b/lldb/include/lldb/Target/LanguageRuntime.h @@ -201,6 +201,8 @@ class LanguageRuntime : public Runtime, public PluginInterface { return false; } + virtual bool IsSymbolARuntimeThunk(const Symbol &symbol) { return false; } + // Given the name of a runtime symbol (e.g. in Objective-C, an ivar offset // symbol), try to determine from the runtime what the value of that symbol // would be. Useful when the underlying binary is stripped. 
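
The TypeSynthetic.h hunk above turns MightHaveChildren() from a pure virtual into a base-class default that returns true, and most of the lldb hunks that follow simply delete per-formatter overrides that returned the same constant. For illustration only, a minimal front end written against the new default needs just the members that remain pure virtual. This sketch is not part of the patch: the class name MinimalFrontEnd and the child name "first" are made up, and the exact base-class signatures (for example llvm::Expected<uint32_t>, whose template arguments this extract elides) are assumed from the LLDB headers.

#include "lldb/DataFormatters/TypeSynthetic.h"

using namespace lldb_private;

// Hypothetical formatter: only the still-pure-virtual members are overridden;
// MightHaveChildren() is inherited and now defaults to returning true.
class MinimalFrontEnd : public SyntheticChildrenFrontEnd {
public:
  MinimalFrontEnd(ValueObject &backend) : SyntheticChildrenFrontEnd(backend) {}

  llvm::Expected<uint32_t> CalculateNumChildren() override { return 1; }

  lldb::ValueObjectSP GetChildAtIndex(uint32_t idx) override {
    // Hand back the backend's first child unchanged.
    return idx == 0 ? m_backend.GetChildAtIndex(0) : lldb::ValueObjectSP();
  }

  lldb::ChildCacheState Update() override {
    return lldb::ChildCacheState::eRefetch;
  }

  size_t GetIndexOfChildWithName(ConstString name) override {
    return name == "first" ? 0 : UINT32_MAX;
  }
};

Putting the default in the base class is the point of the change: the dozens of identical MightHaveChildren() bodies removed in the hunks below carried no information beyond it.
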
diff --git a/lldb/source/API/SBType.cpp b/lldb/source/API/SBType.cpp index 6401d32c85795..9eb1f0c75ea05 100644 --- a/lldb/source/API/SBType.cpp +++ b/lldb/source/API/SBType.cpp @@ -697,6 +697,7 @@ lldb::SBValue SBType::GetTemplateArgumentValue(lldb::SBTarget target, std::optional arg; const bool expand_pack = true; switch (GetTemplateArgumentKind(idx)) { + case eTemplateArgumentKindStructuralValue: case eTemplateArgumentKindIntegral: arg = m_opaque_sp->GetCompilerType(false).GetIntegralTemplateArgument( idx, expand_pack); @@ -708,9 +709,8 @@ lldb::SBValue SBType::GetTemplateArgumentValue(lldb::SBTarget target, if (!arg) return {}; - Scalar value{arg->value}; DataExtractor data; - value.GetData(data); + arg->value.GetData(data); ExecutionContext exe_ctx; auto target_sp = target.GetSP(); diff --git a/lldb/source/DataFormatters/VectorType.cpp b/lldb/source/DataFormatters/VectorType.cpp index cba107b7da890..fa3fb1b674efb 100644 --- a/lldb/source/DataFormatters/VectorType.cpp +++ b/lldb/source/DataFormatters/VectorType.cpp @@ -268,8 +268,6 @@ class VectorTypeSyntheticFrontEnd : public SyntheticChildrenFrontEnd { return lldb::ChildCacheState::eRefetch; } - bool MightHaveChildren() override { return true; } - size_t GetIndexOfChildWithName(ConstString name) override { const char *item_name = name.GetCString(); uint32_t idx = ExtractIndexFromString(item_name); diff --git a/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp b/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp index d7d4654a6b5f4..6a22501c98aab 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp @@ -144,9 +144,6 @@ class BlockPointerSyntheticFrontEnd : public SyntheticChildrenFrontEnd { return lldb::ChildCacheState::eRefetch; } - // maybe return false if the block pointer is, say, null - bool MightHaveChildren() override { return true; } - size_t GetIndexOfChildWithName(ConstString name) override { if (!m_block_struct_type.IsValid()) return UINT32_MAX; diff --git a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp index 5e63d1d7b2145..76a10d2393782 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp @@ -199,11 +199,6 @@ lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t StdlibCoroutineHandleSyntheticFrontEnd::GetIndexOfChildWithName( ConstString name) { if (!m_resume_ptr_sp || !m_destroy_ptr_sp) diff --git a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h index f9765f3255d2b..c33c82bd2fc45 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h +++ b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h @@ -40,8 +40,6 @@ class StdlibCoroutineHandleSyntheticFrontEnd lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: diff --git a/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp b/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp index 33955dccb6ccc..934b456884ac0 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/GenericBitset.cpp @@ -32,7 +32,6 @@ class GenericBitsetFrontEnd : public 
SyntheticChildrenFrontEnd { return formatters::ExtractIndexFromString(name.GetCString()); } - bool MightHaveChildren() override { return true; } lldb::ChildCacheState Update() override; llvm::Expected CalculateNumChildren() override { return m_elements.size(); @@ -91,7 +90,7 @@ lldb::ChildCacheState GenericBitsetFrontEnd::Update() { size_t size = 0; if (auto arg = m_backend.GetCompilerType().GetIntegralTemplateArgument(0)) - size = arg->value.getLimitedValue(); + size = arg->value.GetAPSInt().getLimitedValue(); m_elements.assign(size, ValueObjectSP()); m_first = diff --git a/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp b/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp index a8a7c16de5e86..b224d3e859c84 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/GenericOptional.cpp @@ -42,7 +42,6 @@ class GenericOptionalFrontend : public SyntheticChildrenFrontEnd { return formatters::ExtractIndexFromString(name.GetCString()); } - bool MightHaveChildren() override { return true; } llvm::Expected CalculateNumChildren() override { return m_has_value ? 1U : 0U; } diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp index 2aa8fdba70634..98e787dacc505 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp @@ -309,11 +309,6 @@ lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (name == "__ptr_") @@ -412,11 +407,6 @@ lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (name == "pointer") diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h index cb9ceaf093300..21fbb361eb934 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h @@ -102,8 +102,6 @@ class LibcxxSharedPtrSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; ~LibcxxSharedPtrSyntheticFrontEnd() override; @@ -122,8 +120,6 @@ class LibcxxUniquePtrSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; ~LibcxxUniquePtrSyntheticFrontEnd() override; diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp index 7f30dc186291e..3104f33ee80b3 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxAtomic.cpp @@ -96,8 +96,6 @@ class LibcxxStdAtomicSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -119,11 
+117,6 @@ lldb_private::formatters::LibcxxStdAtomicSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxStdAtomicSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - llvm::Expected lldb_private::formatters:: LibcxxStdAtomicSyntheticFrontEnd::CalculateNumChildren() { return m_real_child ? 1 : 0; diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp index 67c6d1d3e5506..cd13455a2e460 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxInitializerList.cpp @@ -32,8 +32,6 @@ class LibcxxInitializerListSyntheticFrontEnd lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -99,11 +97,6 @@ lldb_private::formatters::LibcxxInitializerListSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxInitializerListSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxInitializerListSyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (!m_start) diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp index f33b148249ab9..ae1ad2bfe7200 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp @@ -109,7 +109,6 @@ class AbstractListFrontEnd : public SyntheticChildrenFrontEnd { size_t GetIndexOfChildWithName(ConstString name) override { return ExtractIndexFromString(name.GetCString()); } - bool MightHaveChildren() override { return true; } lldb::ChildCacheState Update() override; protected: diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp index ebaf60a16b069..d75f25f49fdb4 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp @@ -197,8 +197,6 @@ class LibcxxStdMapSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -237,8 +235,6 @@ class LibCxxMapIteratorSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; ~LibCxxMapIteratorSyntheticFrontEnd() override = default; @@ -397,11 +393,6 @@ lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { return ExtractIndexFromString(name.GetCString()); @@ -497,11 +488,6 @@ lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd::GetChildAtIndex( return m_pair_sp->GetChildAtIndex(idx); } -bool lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibCxxMapIteratorSyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (!m_pair_sp) diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxProxyArray.cpp 
b/lldb/source/Plugins/Language/CPlusPlus/LibCxxProxyArray.cpp index c659adbb9ab2e..fdb8f07ec4006 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxProxyArray.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxProxyArray.cpp @@ -41,8 +41,6 @@ class LibcxxStdProxyArraySyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -173,11 +171,6 @@ lldb_private::formatters::LibcxxStdProxyArraySyntheticFrontEnd::Update() { return ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxStdProxyArraySyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxStdProxyArraySyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (!m_base) diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp index 5b459a17fe29b..8f1e35b3bede9 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxQueue.cpp @@ -25,7 +25,6 @@ class QueueFrontEnd : public SyntheticChildrenFrontEnd { : UINT32_MAX; } - bool MightHaveChildren() override { return true; } lldb::ChildCacheState Update() override; llvm::Expected CalculateNumChildren() override { diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp index f3fe56525789a..e8ab37a022fbc 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxRangesRefView.cpp @@ -40,8 +40,6 @@ class LibcxxStdRangesRefViewSyntheticFrontEnd lldb::ChildCacheState Update() override; - bool MightHaveChildren() override { return true; } - size_t GetIndexOfChildWithName(ConstString name) override { // We only have a single child return 0; diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxSliceArray.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSliceArray.cpp index 5d607709d2c6f..523a7ab1001ec 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxSliceArray.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSliceArray.cpp @@ -62,8 +62,6 @@ class LibcxxStdSliceArraySyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -145,11 +143,6 @@ lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEnd::Update() { return ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxStdSliceArraySyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (!m_start) diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp index 15040295efe6d..21ee83041c065 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxSpan.cpp @@ -55,8 +55,6 @@ class LibcxxStdSpanSyntheticFrontEnd : public SyntheticChildrenFrontEnd { // from the only other place it can be: the template argument. 
lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -119,18 +117,13 @@ lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd::Update() { } else if (auto arg = m_backend.GetCompilerType().GetIntegralTemplateArgument(1)) { - m_num_elements = arg->value.getLimitedValue(); + m_num_elements = arg->value.GetAPSInt().getLimitedValue(); } } return lldb::ChildCacheState::eReuse; } -bool lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxStdSpanSyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (!m_start) diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp index 3e3259ab428df..263ca8349b891 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxTuple.cpp @@ -24,7 +24,6 @@ class TupleFrontEnd: public SyntheticChildrenFrontEnd { return formatters::ExtractIndexFromString(name.GetCString()); } - bool MightHaveChildren() override { return true; } lldb::ChildCacheState Update() override; llvm::Expected CalculateNumChildren() override { return m_elements.size(); diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp index be520ee27af06..395ecc489a17e 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp @@ -40,8 +40,6 @@ class LibcxxStdUnorderedMapSyntheticFrontEnd lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -70,8 +68,6 @@ class LibCxxUnorderedMapIteratorSyntheticFrontEnd lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -295,11 +291,6 @@ lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { return ExtractIndexFromString(name.GetCString()); @@ -407,11 +398,6 @@ lldb::ValueObjectSP lldb_private::formatters:: return lldb::ValueObjectSP(); } -bool lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibCxxUnorderedMapIteratorSyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (name == "first") diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp index 3f519f8c585f5..18c9c9b0e8710 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxValarray.cpp @@ -30,8 +30,6 @@ class LibcxxStdValarraySyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -124,11 +122,6 @@ lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd::Update() { return ChildCacheState::eRefetch; } -bool 
lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxStdValarraySyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (!m_start || !m_finish) diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp index 62794318e0777..c3cb1fdcb4251 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVariant.cpp @@ -203,7 +203,6 @@ class VariantFrontEnd : public SyntheticChildrenFrontEnd { return formatters::ExtractIndexFromString(name.GetCString()); } - bool MightHaveChildren() override { return true; } lldb::ChildCacheState Update() override; llvm::Expected CalculateNumChildren() override { return m_size; } ValueObjectSP GetChildAtIndex(uint32_t idx) override; diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp index b762379a07d3a..ae3ed6326b45f 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp @@ -33,8 +33,6 @@ class LibcxxStdVectorSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -54,8 +52,6 @@ class LibcxxVectorBoolSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override { return true; } - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -153,11 +149,6 @@ lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd:: GetIndexOfChildWithName(ConstString name) { if (!m_start || !m_finish) diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp index 0a1877471916d..127c0cd6666a8 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp @@ -49,8 +49,6 @@ class LibstdcppMapIteratorSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -70,8 +68,6 @@ class LibStdcppSharedPtrSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -149,8 +145,6 @@ LibstdcppMapIteratorSyntheticFrontEnd::GetChildAtIndex(uint32_t idx) { return lldb::ValueObjectSP(); } -bool LibstdcppMapIteratorSyntheticFrontEnd::MightHaveChildren() { return true; } - size_t LibstdcppMapIteratorSyntheticFrontEnd::GetIndexOfChildWithName( ConstString name) { if (name == "first") @@ -232,8 +226,6 @@ VectorIteratorSyntheticFrontEnd::GetChildAtIndex(uint32_t idx) { return lldb::ValueObjectSP(); } -bool VectorIteratorSyntheticFrontEnd::MightHaveChildren() { return true; } - size_t VectorIteratorSyntheticFrontEnd::GetIndexOfChildWithName( ConstString name) { if (name == "item") @@ -416,8 +408,6 @@ lldb::ChildCacheState 
LibStdcppSharedPtrSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool LibStdcppSharedPtrSyntheticFrontEnd::MightHaveChildren() { return true; } - size_t LibStdcppSharedPtrSyntheticFrontEnd::GetIndexOfChildWithName( ConstString name) { if (name == "pointer") diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp index f59969d4cd7a1..68133b202a0c8 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppTuple.cpp @@ -32,8 +32,6 @@ class LibStdcppTupleSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -86,8 +84,6 @@ lldb::ChildCacheState LibStdcppTupleSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool LibStdcppTupleSyntheticFrontEnd::MightHaveChildren() { return true; } - lldb::ValueObjectSP LibStdcppTupleSyntheticFrontEnd::GetChildAtIndex(uint32_t idx) { if (idx < m_members.size() && m_members[idx]) diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp index 9447f7463f64a..209aaced23c7d 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcppUniquePointer.cpp @@ -32,8 +32,6 @@ class LibStdcppUniquePtrSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; bool GetSummary(Stream &stream, const TypeSummaryOptions &options); @@ -113,8 +111,6 @@ lldb::ChildCacheState LibStdcppUniquePtrSyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool LibStdcppUniquePtrSyntheticFrontEnd::MightHaveChildren() { return true; } - lldb::ValueObjectSP LibStdcppUniquePtrSyntheticFrontEnd::GetChildAtIndex(uint32_t idx) { if (idx == 0 && m_ptr_obj) diff --git a/lldb/source/Plugins/Language/ObjC/NSArray.cpp b/lldb/source/Plugins/Language/ObjC/NSArray.cpp index 072b8b5a6c860..7054dd8ffa952 100644 --- a/lldb/source/Plugins/Language/ObjC/NSArray.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSArray.cpp @@ -56,8 +56,6 @@ class NSArrayMSyntheticFrontEndBase : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override = 0; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; protected: @@ -220,8 +218,6 @@ class GenericNSArrayISyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -325,8 +321,6 @@ class NSArray1SyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; }; } // namespace formatters @@ -532,11 +526,6 @@ lldb_private::formatters::GenericNSArrayMSyntheticFrontEnd::Update() { : lldb::ChildCacheState::eRefetch; } -bool -lldb_private::formatters::NSArrayMSyntheticFrontEndBase::MightHaveChildren() { - return true; -} - size_t lldb_private::formatters::NSArrayMSyntheticFrontEndBase::GetIndexOfChildWithName( ConstString name) { @@ -674,13 +663,6 @@ lldb_private::formatters::GenericNSArrayISyntheticFrontEnd 
-bool -lldb_private::formatters::GenericNSArrayISyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - template lldb::ValueObjectSP lldb_private::formatters::GenericNSArrayISyntheticFrontEnd:: @@ -764,10 +746,6 @@ lldb_private::formatters::NSArray1SyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::NSArray1SyntheticFrontEnd::MightHaveChildren() { - return true; -} - lldb::ValueObjectSP lldb_private::formatters::NSArray1SyntheticFrontEnd::GetChildAtIndex( uint32_t idx) { diff --git a/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp b/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp index cf8750fd4976e..008e8eb569f01 100644 --- a/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp @@ -109,8 +109,6 @@ class NSDictionaryISyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -150,8 +148,6 @@ class NSConstantDictionarySyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -182,8 +178,6 @@ class NSCFDictionarySyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -215,8 +209,6 @@ class NSDictionary1SyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -236,8 +228,6 @@ class GenericNSDictionaryMSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -269,8 +259,6 @@ namespace Foundation1100 { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -648,11 +636,6 @@ lldb_private::formatters::NSDictionaryISyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::NSDictionaryISyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - lldb::ValueObjectSP lldb_private::formatters::NSDictionaryISyntheticFrontEnd::GetChildAtIndex( uint32_t idx) { @@ -770,11 +753,6 @@ lldb_private::formatters::NSCFDictionarySyntheticFrontEnd::Update() { : lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::NSCFDictionarySyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - lldb::ValueObjectSP lldb_private::formatters::NSCFDictionarySyntheticFrontEnd::GetChildAtIndex( uint32_t idx) { @@ -914,11 +892,6 @@ lldb_private::formatters::NSConstantDictionarySyntheticFrontEnd::Update() { : lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::NSConstantDictionarySyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - lldb::ValueObjectSP lldb_private::formatters:: NSConstantDictionarySyntheticFrontEnd::GetChildAtIndex(uint32_t idx) { uint32_t num_children = CalculateNumChildrenIgnoringErrors(); @@ -1005,11 +978,6 @@ lldb_private::formatters::NSDictionary1SyntheticFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool 
lldb_private::formatters::NSDictionary1SyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - lldb::ValueObjectSP lldb_private::formatters::NSDictionary1SyntheticFrontEnd::GetChildAtIndex( uint32_t idx) { @@ -1131,13 +1099,6 @@ lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd -bool -lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd:: - MightHaveChildren() { - return true; -} - template lldb::ValueObjectSP lldb_private::formatters::GenericNSDictionaryMSyntheticFrontEnd< @@ -1292,12 +1253,6 @@ lldb::ChildCacheState lldb_private::formatters::Foundation1100:: : lldb::ChildCacheState::eRefetch; } -bool -lldb_private::formatters::Foundation1100:: - NSDictionaryMSyntheticFrontEnd::MightHaveChildren() { - return true; -} - lldb::ValueObjectSP lldb_private::formatters::Foundation1100:: NSDictionaryMSyntheticFrontEnd::GetChildAtIndex(uint32_t idx) { diff --git a/lldb/source/Plugins/Language/ObjC/NSError.cpp b/lldb/source/Plugins/Language/ObjC/NSError.cpp index bb54044ae1d61..5557daa2bf1b2 100644 --- a/lldb/source/Plugins/Language/ObjC/NSError.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSError.cpp @@ -165,8 +165,6 @@ class NSErrorSyntheticFrontEnd : public SyntheticChildrenFrontEnd { return lldb::ChildCacheState::eRefetch; } - bool MightHaveChildren() override { return true; } - size_t GetIndexOfChildWithName(ConstString name) override { static ConstString g_userInfo("_userInfo"); if (name == g_userInfo) diff --git a/lldb/source/Plugins/Language/ObjC/NSException.cpp b/lldb/source/Plugins/Language/ObjC/NSException.cpp index b7d42bc5745e3..67f3f1779e147 100644 --- a/lldb/source/Plugins/Language/ObjC/NSException.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSException.cpp @@ -148,8 +148,6 @@ class NSExceptionSyntheticFrontEnd : public SyntheticChildrenFrontEnd { : lldb::ChildCacheState::eRefetch; } - bool MightHaveChildren() override { return true; } - size_t GetIndexOfChildWithName(ConstString name) override { // NSException has 4 members: // NSString *name; diff --git a/lldb/source/Plugins/Language/ObjC/NSSet.cpp b/lldb/source/Plugins/Language/ObjC/NSSet.cpp index a184ec624b63e..55069495676e5 100644 --- a/lldb/source/Plugins/Language/ObjC/NSSet.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSSet.cpp @@ -52,8 +52,6 @@ class NSSetISyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -90,8 +88,6 @@ class NSCFSetSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -123,8 +119,6 @@ class GenericNSSetMSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ChildCacheState Update() override; - bool MightHaveChildren() override; - size_t GetIndexOfChildWithName(ConstString name) override; private: @@ -225,24 +219,7 @@ namespace Foundation1437 { return __NSSetMSize_Impl(process, valobj_addr, error); } } -} - -class NSSetCodeRunningSyntheticFrontEnd : public SyntheticChildrenFrontEnd { -public: - NSSetCodeRunningSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp); - - ~NSSetCodeRunningSyntheticFrontEnd() override; - - llvm::Expected CalculateNumChildren() override; - - lldb::ValueObjectSP GetChildAtIndex(uint32_t idx) override; - - lldb::ChildCacheState Update() override; - - bool MightHaveChildren() override; - - size_t GetIndexOfChildWithName(ConstString name) override; 
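The deletions above all remove the same boilerplate: each synthetic front end overrode MightHaveChildren() only to return true. A minimal standalone C++ sketch of the refactoring pattern, assuming (as the patch implies) that the base class now supplies that default so subclasses override it only when they have something different to say; all names here are illustrative stand-ins, not the real LLDB classes:

#include <cstdio>

// Stand-in for the synthetic front-end base class; names are illustrative.
struct FrontEndBase {
  virtual ~FrontEndBase() = default;
  // The "usually true" answer now lives once in the base instead of being
  // repeated in every subclass.
  virtual bool MightHaveChildren() { return true; }
  virtual unsigned CalculateNumChildren() = 0;
};

// After the cleanup a typical front end only implements what it actually
// customizes.
struct TupleFrontEnd : FrontEndBase {
  unsigned CalculateNumChildren() override { return 2; }
};

int main() {
  TupleFrontEnd fe;
  std::printf("might have children: %d\n", fe.MightHaveChildren()); // prints 1
  return 0;
}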
-}; + } // namespace Foundation1437 } // namespace formatters } // namespace lldb_private @@ -461,10 +438,6 @@ lldb_private::formatters::NSSetISyntheticFrontEnd::Update() { return lldb::ChildCacheState::eReuse; } -bool lldb_private::formatters::NSSetISyntheticFrontEnd::MightHaveChildren() { - return true; -} - lldb::ValueObjectSP lldb_private::formatters::NSSetISyntheticFrontEnd::GetChildAtIndex( uint32_t idx) { @@ -582,10 +555,6 @@ lldb_private::formatters::NSCFSetSyntheticFrontEnd::Update() { : lldb::ChildCacheState::eRefetch; } -bool lldb_private::formatters::NSCFSetSyntheticFrontEnd::MightHaveChildren() { - return true; -} - lldb::ValueObjectSP lldb_private::formatters::NSCFSetSyntheticFrontEnd::GetChildAtIndex( uint32_t idx) { @@ -739,13 +708,6 @@ lldb_private::formatters::GenericNSSetMSyntheticFrontEnd::Update() { : lldb::ChildCacheState::eRefetch; } -template -bool -lldb_private::formatters:: - GenericNSSetMSyntheticFrontEnd::MightHaveChildren() { - return true; -} - template lldb::ValueObjectSP lldb_private::formatters:: diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp index 42fa54634841c..21a5ebe53073a 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp @@ -476,3 +476,14 @@ CPPLanguageRuntime::GetStepThroughTrampolinePlan(Thread &thread, return ret_plan_sp; } + +bool CPPLanguageRuntime::IsSymbolARuntimeThunk(const Symbol &symbol) { + llvm::StringRef mangled_name = + symbol.GetMangled().GetMangledName().GetStringRef(); + // Virtual function overriding from a non-virtual base use a "Th" prefix. + // Virtual function overriding from a virtual base must use a "Tv" prefix. + // Virtual function overriding thunks with covariant returns use a "Tc" + // prefix. 
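The prefixes named in the comments above can be tested directly on the mangled string, which is what the return statement that follows does with llvm::StringRef::starts_with. A standalone sketch of the same check using std::string_view; the helper name and the sample mangled names are illustrative:

#include <cassert>
#include <string_view>

// Returns true if the mangled name looks like an Itanium C++ thunk:
//   _ZTh... non-virtual-offset thunk, _ZTv... virtual-offset thunk,
//   _ZTc... covariant-return thunk.
static bool looksLikeRuntimeThunk(std::string_view mangled) {
  return mangled.substr(0, 4) == "_ZTh" || mangled.substr(0, 4) == "_ZTv" ||
         mangled.substr(0, 4) == "_ZTc";
}

int main() {
  assert(looksLikeRuntimeThunk("_ZThn8_N8Derived14doitEv")); // sample thunk name
  assert(!looksLikeRuntimeThunk("_ZN8Derived14doitEv"));     // plain member function
  return 0;
}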
+ return mangled_name.starts_with("_ZTh") || mangled_name.starts_with("_ZTv") || + mangled_name.starts_with("_ZTc"); +} diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.h b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.h index 57cfe28245808..05639e9798917 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.h +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.h @@ -78,6 +78,9 @@ class CPPLanguageRuntime : public LanguageRuntime { bool stop_others) override; bool IsAllowedRuntimeValue(ConstString name) override; + + bool IsSymbolARuntimeThunk(const Symbol &symbol) override; + protected: // Classes that inherit from CPPLanguageRuntime can see and modify these CPPLanguageRuntime(Process *process); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index ec0004c70c6da..2d4d22559963f 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -1973,6 +1973,33 @@ class DWARFASTParserClang::DelayedAddObjCClassProperty { ClangASTMetadata m_metadata; }; +static std::optional MakeAPValue(const clang::ASTContext &ast, + CompilerType clang_type, + uint64_t value) { + std::optional bit_width = clang_type.GetBitSize(nullptr); + if (!bit_width) + return std::nullopt; + + bool is_signed = false; + const bool is_integral = clang_type.IsIntegerOrEnumerationType(is_signed); + + llvm::APSInt apint(*bit_width, !is_signed); + apint = value; + + if (is_integral) + return clang::APValue(apint); + + uint32_t count; + bool is_complex; + // FIXME: we currently support a limited set of floating point types. + // E.g., 16-bit floats are not supported. + if (!clang_type.IsFloatingPointType(count, is_complex)) + return std::nullopt; + + return clang::APValue(llvm::APFloat( + ast.getFloatTypeSemantics(ClangUtil::GetQualType(clang_type)), apint)); +} + bool DWARFASTParserClang::ParseTemplateDIE( const DWARFDIE &die, TypeSystemClang::TemplateParameterInfos &template_param_infos) { @@ -2050,28 +2077,26 @@ bool DWARFASTParserClang::ParseTemplateDIE( clang_type = m_ast.GetBasicType(eBasicTypeVoid); if (!is_template_template_argument) { - bool is_signed = false; - // Get the signed value for any integer or enumeration if available - clang_type.IsIntegerOrEnumerationType(is_signed); if (name && !name[0]) name = nullptr; if (tag == DW_TAG_template_value_parameter && uval64_valid) { - std::optional size = clang_type.GetBitSize(nullptr); - if (!size) - return false; - llvm::APInt apint(*size, uval64, is_signed); - template_param_infos.InsertArg( - name, clang::TemplateArgument(ast, llvm::APSInt(apint, !is_signed), - ClangUtil::GetQualType(clang_type), - is_default_template_arg)); - } else { - template_param_infos.InsertArg( - name, clang::TemplateArgument(ClangUtil::GetQualType(clang_type), - /*isNullPtr*/ false, - is_default_template_arg)); + if (auto value = MakeAPValue(ast, clang_type, uval64)) { + template_param_infos.InsertArg( + name, clang::TemplateArgument( + ast, ClangUtil::GetQualType(clang_type), + std::move(*value), is_default_template_arg)); + return true; + } } + + // We get here if this is a type-template parameter or we couldn't create + // a non-type template parameter. 
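MakeAPValue above turns the raw DW_AT_const_value payload into an integer or floating-point clang::APValue by reinterpreting the bits in the parameter type's own float semantics. A standalone sketch of that reinterpretation for the common 32-bit and 64-bit cases, using std::bit_cast in place of APInt/APFloat; the function names and sample bit patterns are illustrative:

#include <bit>
#include <cstdint>
#include <cstdio>

// A template value parameter's constant arrives as a plain integer payload;
// for floating-point parameter types that payload is the IEEE bit pattern.
static float floatFromPayload(uint64_t payload) {
  return std::bit_cast<float>(static_cast<uint32_t>(payload));
}

static double doubleFromPayload(uint64_t payload) {
  return std::bit_cast<double>(payload);
}

int main() {
  // 0x40000000 is the IEEE-754 bit pattern of 2.0f.
  std::printf("%f\n", floatFromPayload(0x40000000u));
  // 0xC06F400000000000 is the bit pattern of -250.0.
  std::printf("%f\n", doubleFromPayload(0xC06F400000000000ull));
  return 0;
}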
+ template_param_infos.InsertArg( + name, clang::TemplateArgument(ClangUtil::GetQualType(clang_type), + /*isNullPtr*/ false, + is_default_template_arg)); } else { auto *tplt_type = m_ast.CreateTemplateTemplateParmDecl(template_name); template_param_infos.InsertArg( diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index bcb63f719de10..1e0c7f0514941 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -1311,10 +1311,18 @@ CompilerType TypeSystemClang::CreateRecordType( } namespace { -/// Returns true iff the given TemplateArgument should be represented as an -/// NonTypeTemplateParmDecl in the AST. -bool IsValueParam(const clang::TemplateArgument &argument) { - return argument.getKind() == TemplateArgument::Integral; +/// Returns the type of the template argument iff the given TemplateArgument +/// should be represented as an NonTypeTemplateParmDecl in the AST. Returns +/// a null QualType otherwise. +QualType GetValueParamType(const clang::TemplateArgument &argument) { + switch (argument.getKind()) { + case TemplateArgument::Integral: + return argument.getIntegralType(); + case TemplateArgument::StructuralValue: + return argument.getStructuralValueType(); + default: + return {}; + } } void AddAccessSpecifierDecl(clang::CXXRecordDecl *cxx_record_decl, @@ -1361,8 +1369,8 @@ static TemplateParameterList *CreateTemplateParameterList( if (name && name[0]) identifier_info = &ast.Idents.get(name); TemplateArgument const &targ = args[i]; - if (IsValueParam(targ)) { - QualType template_param_type = targ.getIntegralType(); + QualType template_param_type = GetValueParamType(targ); + if (!template_param_type.isNull()) { template_param_decls.push_back(NonTypeTemplateParmDecl::Create( ast, decl_context, SourceLocation(), SourceLocation(), depth, i, identifier_info, template_param_type, parameter_pack, @@ -1380,10 +1388,11 @@ static TemplateParameterList *CreateTemplateParameterList( identifier_info = &ast.Idents.get(template_param_infos.GetPackName()); const bool parameter_pack_true = true; - if (!template_param_infos.GetParameterPack().IsEmpty() && - IsValueParam(template_param_infos.GetParameterPack().Front())) { - QualType template_param_type = - template_param_infos.GetParameterPack().Front().getIntegralType(); + QualType template_param_type = + !template_param_infos.GetParameterPack().IsEmpty() + ? GetValueParamType(template_param_infos.GetParameterPack().Front()) + : QualType(); + if (!template_param_type.isNull()) { template_param_decls.push_back(NonTypeTemplateParmDecl::Create( ast, decl_context, SourceLocation(), SourceLocation(), depth, num_template_params, identifier_info, template_param_type, @@ -1458,10 +1467,12 @@ static bool TemplateParameterAllowsValue(NamedDecl *param, } else if (auto *type_param = llvm::dyn_cast(param)) { // Compare the argument kind, i.e. ensure that != . - if (!IsValueParam(value)) + QualType value_param_type = GetValueParamType(value); + if (value_param_type.isNull()) return false; + // Compare the integral type, i.e. ensure that != . 
- if (type_param->getType() != value.getIntegralType()) + if (type_param->getType() != value_param_type) return false; } else { // There is no way to create other parameter decls at the moment, so we @@ -7351,10 +7362,27 @@ TypeSystemClang::GetIntegralTemplateArgument(lldb::opaque_compiler_type_t type, return std::nullopt; const auto *arg = GetNthTemplateArgument(template_decl, idx, expand_pack); - if (!arg || arg->getKind() != clang::TemplateArgument::Integral) + if (!arg) return std::nullopt; - return {{arg->getAsIntegral(), GetType(arg->getIntegralType())}}; + switch (arg->getKind()) { + case clang::TemplateArgument::Integral: + return {{arg->getAsIntegral(), GetType(arg->getIntegralType())}}; + case clang::TemplateArgument::StructuralValue: { + clang::APValue value = arg->getAsStructuralValue(); + CompilerType type = GetType(arg->getStructuralValueType()); + + if (value.isFloat()) + return {{value.getFloat(), type}}; + + if (value.isInt()) + return {{value.getInt(), type}}; + + return std::nullopt; + } + default: + return std::nullopt; + } } CompilerType TypeSystemClang::GetTypeForFormatters(void *type) { diff --git a/lldb/source/Target/ThreadPlanShouldStopHere.cpp b/lldb/source/Target/ThreadPlanShouldStopHere.cpp index e72f8d8f51a20..fa6bc08a9914d 100644 --- a/lldb/source/Target/ThreadPlanShouldStopHere.cpp +++ b/lldb/source/Target/ThreadPlanShouldStopHere.cpp @@ -8,6 +8,7 @@ #include "lldb/Target/ThreadPlanShouldStopHere.h" #include "lldb/Symbol/Symbol.h" +#include "lldb/Target/LanguageRuntime.h" #include "lldb/Target/RegisterContext.h" #include "lldb/Target/Thread.h" #include "lldb/Utility/LLDBLog.h" @@ -76,6 +77,19 @@ bool ThreadPlanShouldStopHere::DefaultShouldStopHereCallback( } } + // Check whether the frame we are in is a language runtime thunk, only for + // step out: + if (operation == eFrameCompareOlder) { + if (Symbol *symbol = frame->GetSymbolContext(eSymbolContextSymbol).symbol) { + ProcessSP process_sp(current_plan->GetThread().GetProcess()); + for (auto *runtime : process_sp->GetLanguageRuntimes()) { + if (runtime->IsSymbolARuntimeThunk(*symbol)) { + should_stop_here = false; + break; + } + } + } + } // Always avoid code with line number 0. // FIXME: At present the ShouldStop and the StepFromHere calculate this // independently. If this ever @@ -109,18 +123,35 @@ ThreadPlanSP ThreadPlanShouldStopHere::DefaultStepFromHereCallback( if (sc.line_entry.line == 0) { AddressRange range = sc.line_entry.range; - - // If the whole function is marked line 0 just step out, that's easier & - // faster than continuing to step through it. bool just_step_out = false; - if (sc.symbol && sc.symbol->ValueIsAddress()) { - Address symbol_end = sc.symbol->GetAddress(); - symbol_end.Slide(sc.symbol->GetByteSize() - 1); - if (range.ContainsFileAddress(sc.symbol->GetAddress()) && - range.ContainsFileAddress(symbol_end)) { - LLDB_LOGF(log, "Stopped in a function with only line 0 lines, just " - "stepping out."); - just_step_out = true; + if (sc.symbol) { + ProcessSP process_sp(current_plan->GetThread().GetProcess()); + + // If this is a runtime thunk, step through it, rather than stepping out + // because it's marked line 0. 
+ bool is_thunk = false; + for (auto *runtime : process_sp->GetLanguageRuntimes()) { + if (runtime->IsSymbolARuntimeThunk(*sc.symbol)) { + LLDB_LOGF(log, "In runtime thunk %s - stepping out.", + sc.symbol->GetName().GetCString()); + is_thunk = true; + break; + } + } + + // If the whole function is marked line 0 just step out, that's easier & + // faster than continuing to step through it. + // FIXME: This assumes that the function is a single line range. It could + // be a series of contiguous line 0 ranges. Check for that too. + if (!is_thunk && sc.symbol->ValueIsAddress()) { + Address symbol_end = sc.symbol->GetAddress(); + symbol_end.Slide(sc.symbol->GetByteSize() - 1); + if (range.ContainsFileAddress(sc.symbol->GetAddress()) && + range.ContainsFileAddress(symbol_end)) { + LLDB_LOGF(log, "Stopped in a function with only line 0 lines, just " + "stepping out."); + just_step_out = true; + } } } if (!just_step_out) { diff --git a/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py b/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py index db5388b8bcc6d..eac7b5ef1099a 100644 --- a/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py +++ b/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py @@ -62,10 +62,44 @@ def test(self): self.assertEqual(template_param_value.GetTypeName(), "char") self.assertEqual(chr(template_param_value.GetValueAsSigned()), "v") - # FIXME: type should be Foo - # FIXME: double/float NTTP parameter values currently not supported. - value = self.expect_expr("temp4", result_type="Foo") + value = self.expect_expr("temp4", result_type="Foo") template_param_value = value.GetType().GetTemplateArgumentValue(target, 1) self.assertEqual(template_param_value.GetTypeName(), "float") # FIXME: this should return a float self.assertEqual(template_param_value.GetValueAsSigned(), 2) + + value = self.expect_expr("temp5", result_type="Foo") + template_param_value = value.GetType().GetTemplateArgumentValue(target, 1) + self.assertEqual(template_param_value.GetTypeName(), "double") + # FIXME: this should return a float + self.assertEqual(template_param_value.GetValueAsSigned(), -250) + + # FIXME: type should be Foo + value = self.expect_expr("temp6", result_type="Foo") + self.assertFalse(value.GetType().GetTemplateArgumentValue(target, 1)) + + # FIXME: support wider range of floating point types + value = self.expect_expr("temp7", result_type="Foo<__fp16, __fp16>") + self.assertFalse(value.GetType().GetTemplateArgumentValue(target, 1)) + + value = self.expect_expr("temp8", result_type="Foo<__fp16, __fp16>") + self.assertFalse(value.GetType().GetTemplateArgumentValue(target, 1)) + + value = self.expect_expr("temp9", result_type="Bar") + template_param_value = value.GetType().GetTemplateArgumentValue(target, 1) + self.assertEqual(template_param_value.GetTypeName(), "double") + # FIXME: this should return a float + self.assertEqual(template_param_value.GetValueAsSigned(), 1) + + value = self.expect_expr( + "temp10", result_type="Bar" + ) + template_param_value = value.GetType().GetTemplateArgumentValue(target, 1) + self.assertEqual(template_param_value.GetTypeName(), "float") + # FIXME: this should return a float + self.assertEqual(template_param_value.GetValueAsSigned(), 1) + + template_param_value = value.GetType().GetTemplateArgumentValue(target, 2) + self.assertEqual(template_param_value.GetTypeName(), "float") + # FIXME: this should return a float + self.assertEqual(template_param_value.GetValueAsSigned(), 2) diff 
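The assertions above read template argument values from class templates instantiated with floating-point non-type parameters, a C++20 feature. A representative standalone sketch of the kind of declarations such a test exercises; the exact parameter types and constants below are assumptions, not the test's literal source:

// Requires -std=c++20 for floating-point non-type template parameters.
template <typename T, T value> struct Foo {};
template <typename T, T... values> struct Bar {};

Foo<float, 2.0f> temp4;        // single float argument
Foo<double, -250.0> temp5;     // single double argument
Bar<double, 1.2> temp9;        // parameter pack with one element
Bar<float, 1.0f, 2.0f> temp10; // parameter pack with two elements

int main() { return 0; }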
--git a/lldb/test/API/lang/cpp/template-arguments/main.cpp b/lldb/test/API/lang/cpp/template-arguments/main.cpp index 0c0eb97cbc858..c08679aa0e166 100644 --- a/lldb/test/API/lang/cpp/template-arguments/main.cpp +++ b/lldb/test/API/lang/cpp/template-arguments/main.cpp @@ -9,5 +9,13 @@ template struct Foo {}; Foo temp2; Foo temp3; Foo temp4; +Foo temp5; +Foo temp6; +Foo<_Float16, _Float16(1.0)> temp7; +Foo<__bf16, __bf16(1.0)> temp8; + +template struct Bar {}; +Bar temp9; +Bar temp10; int main() {} diff --git a/lldb/test/API/lang/cpp/thunk/Makefile b/lldb/test/API/lang/cpp/thunk/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/lang/cpp/thunk/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/lang/cpp/thunk/TestThunk.py b/lldb/test/API/lang/cpp/thunk/TestThunk.py new file mode 100644 index 0000000000000..9370c1c58c18b --- /dev/null +++ b/lldb/test/API/lang/cpp/thunk/TestThunk.py @@ -0,0 +1,47 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class ThunkTest(TestBase): + def test_step_through_thunk(self): + self.build() + lldbutil.run_to_name_breakpoint(self, "testit") + + # Make sure we step through the thunk into Derived1::doit + self.expect( + "step", + STEP_IN_SUCCEEDED, + substrs=["stop reason = step in", "Derived1::doit"], + ) + + self.runCmd("continue") + + self.expect( + "step", + STEP_IN_SUCCEEDED, + substrs=["stop reason = step in", "Derived2::doit"], + ) + + @skipIfWindows + def test_step_out_thunk(self): + self.build() + lldbutil.run_to_name_breakpoint(self, "testit_debug") + + # Make sure we step out of the thunk and end up in testit_debug. + source = "main.cpp" + line = line_number(source, "// Step here") + self.expect( + "step", + STEP_IN_SUCCEEDED, + substrs=["stop reason = step in", "{}:{}".format(source, line)], + ) + + self.runCmd("continue") + + self.expect( + "step", + STEP_IN_SUCCEEDED, + substrs=["stop reason = step in", "Derived2::doit_debug"], + ) diff --git a/lldb/test/API/lang/cpp/thunk/main.cpp b/lldb/test/API/lang/cpp/thunk/main.cpp new file mode 100644 index 0000000000000..82d17b1350093 --- /dev/null +++ b/lldb/test/API/lang/cpp/thunk/main.cpp @@ -0,0 +1,48 @@ +#include + +class Base1 { +public: + virtual ~Base1() {} +}; + +class Base2 { +public: + virtual void doit() = 0; + virtual void doit_debug() = 0; +}; + +Base2 *b; + +class Derived1 : public Base1, public Base2 { +public: + virtual void doit() { printf("Derived1\n"); } + virtual void __attribute__((nodebug)) doit_debug() { + printf("Derived1 (no debug)\n"); + } +}; + +class Derived2 : public Base2 { +public: + virtual void doit() { printf("Derived2\n"); } + virtual void doit_debug() { printf("Derived2 (debug)\n"); } +}; + +void testit() { b->doit(); } + +void testit_debug() { + b->doit_debug(); + printf("This is where I should step out to with nodebug.\n"); // Step here +} + +int main() { + + b = new Derived1(); + testit(); + testit_debug(); + + b = new Derived2(); + testit(); + testit_debug(); + + return 0; +} diff --git a/lldb/unittests/Symbol/TestTypeSystemClang.cpp b/lldb/unittests/Symbol/TestTypeSystemClang.cpp index 23374062127e0..a9b0c87c4fbce 100644 --- a/lldb/unittests/Symbol/TestTypeSystemClang.cpp +++ b/lldb/unittests/Symbol/TestTypeSystemClang.cpp @@ -525,7 +525,17 @@ TEST_F(TestTypeSystemClang, TemplateArguments) { infos.InsertArg("I", TemplateArgument(m_ast->getASTContext(), arg, 
m_ast->getASTContext().IntTy)); - // template struct foo; + llvm::APFloat float_arg(5.5f); + infos.InsertArg("F", TemplateArgument(m_ast->getASTContext(), + m_ast->getASTContext().FloatTy, + clang::APValue(float_arg))); + + llvm::APFloat double_arg(-15.2); + infos.InsertArg("D", TemplateArgument(m_ast->getASTContext(), + m_ast->getASTContext().DoubleTy, + clang::APValue(double_arg))); + + // template struct foo; ClassTemplateDecl *decl = m_ast->CreateClassTemplateDecl( m_ast->GetTranslationUnitDecl(), OptionalClangModuleID(), eAccessPublic, "foo", llvm::to_underlying(clang::TagTypeKind::Struct), infos); @@ -555,6 +565,10 @@ TEST_F(TestTypeSystemClang, TemplateArguments) { CompilerType int_type(m_ast->weak_from_this(), m_ast->getASTContext().IntTy.getAsOpaquePtr()); + CompilerType float_type(m_ast->weak_from_this(), + m_ast->getASTContext().FloatTy.getAsOpaquePtr()); + CompilerType double_type(m_ast->weak_from_this(), + m_ast->getASTContext().DoubleTy.getAsOpaquePtr()); for (CompilerType t : {type, typedef_type, auto_type}) { SCOPED_TRACE(t.GetTypeName().AsCString()); @@ -577,8 +591,32 @@ TEST_F(TestTypeSystemClang, TemplateArguments) { auto result = m_ast->GetIntegralTemplateArgument(t.GetOpaqueQualType(), 1, expand_pack); ASSERT_NE(std::nullopt, result); - EXPECT_EQ(arg, result->value); + EXPECT_EQ(arg, result->value.GetAPSInt()); EXPECT_EQ(int_type, result->type); + + EXPECT_EQ( + m_ast->GetTemplateArgumentKind(t.GetOpaqueQualType(), 2, expand_pack), + eTemplateArgumentKindStructuralValue); + EXPECT_EQ( + m_ast->GetTypeTemplateArgument(t.GetOpaqueQualType(), 2, expand_pack), + CompilerType()); + auto float_result = m_ast->GetIntegralTemplateArgument( + t.GetOpaqueQualType(), 2, expand_pack); + ASSERT_NE(std::nullopt, float_result); + EXPECT_EQ(float_arg, float_result->value.GetAPFloat()); + EXPECT_EQ(float_type, float_result->type); + + EXPECT_EQ( + m_ast->GetTemplateArgumentKind(t.GetOpaqueQualType(), 3, expand_pack), + eTemplateArgumentKindStructuralValue); + EXPECT_EQ( + m_ast->GetTypeTemplateArgument(t.GetOpaqueQualType(), 3, expand_pack), + CompilerType()); + auto double_result = m_ast->GetIntegralTemplateArgument( + t.GetOpaqueQualType(), 3, expand_pack); + ASSERT_NE(std::nullopt, double_result); + EXPECT_EQ(double_arg, double_result->value.GetAPFloat()); + EXPECT_EQ(double_type, double_result->type); } } diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index c128fd2ed125c..15ae04f5a6913 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -390,7 +390,14 @@ if (NOT PURE_WINDOWS) if( HAVE_LIBDL ) list(APPEND CMAKE_REQUIRED_LIBRARIES dl) endif() + # Add the _XOPEN_SOURCE macro on z/OS, as certain test(s) use dlopen + if (ZOS) + list(APPEND CMAKE_REQUIRED_DEFINITIONS "-D_XOPEN_SOURCE=600") + endif() check_symbol_exists(dlopen dlfcn.h HAVE_DLOPEN) + if (ZOS) + list(REMOVE_ITEM CMAKE_REQUIRED_DEFINITIONS "-D_XOPEN_SOURCE=600") + endif() if( HAVE_LIBDL ) list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES dl) endif() diff --git a/llvm/docs/GitHub.rst b/llvm/docs/GitHub.rst index b5b75db91e1c4..979b87c8d02f6 100644 --- a/llvm/docs/GitHub.rst +++ b/llvm/docs/GitHub.rst @@ -438,7 +438,7 @@ Releases Backporting Fixes to the Release Branches ----------------------------------------- You can use special comments on issues or pull requests to make backport -requests for the release branches. To do this, after your pull reuest has been +requests for the release branches. To do this, after your pull request has been merged: 1. 
Edit "Milestone" at the right side of the isssue or pull request diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index c9543ff09217a..75638d75e70f2 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -155,6 +155,8 @@ Changes to LLDB does not provide API to query the number of supported hardware watchpoints. Therefore current implementation allows only 1 watchpoint, as tested with Windows 11 on the Microsoft SQ2 and Snapdragon Elite X platforms. +* LLDB now steps through C++ thunks. This fixes an issue where previously, it + wouldn't step into multiple inheritance virtual functions. ### Changes to lldb-dap diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst index f41c5ed0ad0cd..001a314cb1331 100644 --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst @@ -313,7 +313,7 @@ in "``TheModule``"s symbol table. Finally, we set the name of each of the function's arguments according to the names given in the Prototype. This step isn't strictly necessary, but keeping the names consistent makes the IR more readable, and allows subsequent code to -refer directly to the arguments for their names, rather than having to look up +refer directly to the arguments for their names, rather than having to look them up in the Prototype AST. At this point we have a function prototype with no body. This is how LLVM IR diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index 1b49f8a3e85b1..67f9f24c3b7a4 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -999,11 +999,6 @@ bool isGuaranteedToExecuteForEveryIteration(const Instruction *I, /// getGuaranteedNonPoisonOp. bool propagatesPoison(const Use &PoisonOp); -/// Insert operands of I into Ops such that I will trigger undefined behavior -/// if I is executed and that operand has a poison value. -void getGuaranteedNonPoisonOps(const Instruction *I, - SmallVectorImpl &Ops); - /// Return true if the given instruction must trigger undefined behavior /// when I is executed with any operands which appear in KnownPoison holding /// a poison value at the point of execution. diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 68ed812222dfd..665c4d6baad80 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1459,6 +1459,23 @@ enum NodeType { VECREDUCE_UMAX, VECREDUCE_UMIN, + // PARTIAL_REDUCE_[U|S]MLA(Accumulator, Input1, Input2) + // The partial reduction nodes sign or zero extend Input1 and Input2 to the + // element type of Accumulator before multiplying their results. + // This result is concatenated to the Accumulator, and this is then reduced, + // using addition, to the result type. + // The output is only expected to either be given to another partial reduction + // operation or an equivalent vector reduce operation, so the order in which + // the elements are reduced is deliberately not specified. + // Input1 and Input2 must be the same type. Accumulator and the output must be + // the same type. + // The number of elements in Input1 and Input2 must be a positive integer + // multiple of the number of elements in the Accumulator / output type. + // Input1 and Input2 must have an element type which is the same as or smaller + // than the element type of the Accumulator and output. 
+ PARTIAL_REDUCE_SMLA, + PARTIAL_REDUCE_UMLA, + // The `llvm.experimental.stackmap` intrinsic. // Operands: input chain, glue, , , [live0[, live1...]] // Outputs: output chain, glue diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h b/llvm/include/llvm/CodeGen/RegAllocEvictionAdvisor.h similarity index 71% rename from llvm/lib/CodeGen/RegAllocEvictionAdvisor.h rename to llvm/include/llvm/CodeGen/RegAllocEvictionAdvisor.h index 52dd946a68540..a14548ff6959e 100644 --- a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h +++ b/llvm/include/llvm/CodeGen/RegAllocEvictionAdvisor.h @@ -9,13 +9,18 @@ #ifndef LLVM_CODEGEN_REGALLOCEVICTIONADVISOR_H #define LLVM_CODEGEN_REGALLOCEVICTIONADVISOR_H +#include "llvm/ADT/Any.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Register.h" #include "llvm/Config/llvm-config.h" +#include "llvm/IR/PassManager.h" #include "llvm/MC/MCRegister.h" #include "llvm/Pass.h" +#include "llvm/Support/Compiler.h" namespace llvm { class AllocationOrder; @@ -149,6 +154,35 @@ class RegAllocEvictionAdvisor { const bool EnableLocalReassign; }; +/// Common provider for legacy and new pass managers. +/// This keeps the state for logging, and sets up and holds the provider. +/// The legacy pass itself used to keep the logging state and provider, +/// so this extraction helps the NPM analysis to reuse the logic. +/// TODO: Coalesce this with the NPM analysis when legacy PM is removed. +class RegAllocEvictionAdvisorProvider { +public: + enum class AdvisorMode : int { Default, Release, Development }; + RegAllocEvictionAdvisorProvider(AdvisorMode Mode, LLVMContext &Ctx) + : Ctx(Ctx), Mode(Mode) {} + + virtual ~RegAllocEvictionAdvisorProvider() = default; + + virtual void logRewardIfNeeded(const MachineFunction &MF, + llvm::function_ref GetReward) {} + + virtual std::unique_ptr + getAdvisor(const MachineFunction &MF, const RAGreedy &RA, + MachineBlockFrequencyInfo *MBFI, MachineLoopInfo *Loops) = 0; + + AdvisorMode getAdvisorMode() const { return Mode; } + +protected: + LLVMContext &Ctx; + +private: + const AdvisorMode Mode; +}; + /// ImmutableAnalysis abstraction for fetching the Eviction Advisor. We model it /// as an analysis to decouple the user from the implementation insofar as /// dependencies on other analyses goes. The motivation for it being an @@ -164,20 +198,20 @@ class RegAllocEvictionAdvisor { /// /// Because we need to offer additional services in 'development' mode, the /// implementations of this analysis need to implement RTTI support. -class RegAllocEvictionAdvisorAnalysis : public ImmutablePass { +class RegAllocEvictionAdvisorAnalysisLegacy : public ImmutablePass { public: enum class AdvisorMode : int { Default, Release, Development }; - RegAllocEvictionAdvisorAnalysis(AdvisorMode Mode) - : ImmutablePass(ID), Mode(Mode){}; + RegAllocEvictionAdvisorAnalysisLegacy(AdvisorMode Mode) + : ImmutablePass(ID), Mode(Mode) {}; static char ID; /// Get an advisor for the given context (i.e. 
machine function, etc) - virtual std::unique_ptr - getAdvisor(const MachineFunction &MF, const RAGreedy &RA) = 0; + RegAllocEvictionAdvisorProvider &getProvider() { return *Provider; } + AdvisorMode getAdvisorMode() const { return Mode; } virtual void logRewardIfNeeded(const MachineFunction &MF, - llvm::function_ref GetReward){}; + function_ref GetReward) {}; protected: // This analysis preserves everything, and subclasses may have additional @@ -185,19 +219,65 @@ class RegAllocEvictionAdvisorAnalysis : public ImmutablePass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); } + std::unique_ptr Provider; private: StringRef getPassName() const override; const AdvisorMode Mode; }; +/// A MachineFunction analysis for fetching the Eviction Advisor. +/// This sets up the Provider lazily and caches it. +/// - in the ML implementation case, the evaluator is stateless but (especially +/// in the development mode) expensive to set up. With a Module Analysis, we +/// `require` it and set it up once. +/// - in the 'development' mode ML case, we want to capture the training log +/// during allocation (this is a log of features encountered and decisions +/// made), and then measure a score, potentially a few steps after allocation +/// completes. So we need a Module analysis to keep the logger state around +/// until we can make that measurement. +class RegAllocEvictionAdvisorAnalysis + : public AnalysisInfoMixin { + static AnalysisKey Key; + friend AnalysisInfoMixin; + +public: + struct Result { + // owned by this analysis + RegAllocEvictionAdvisorProvider *Provider; + + bool invalidate(MachineFunction &MF, const PreservedAnalyses &PA, + MachineFunctionAnalysisManager::Invalidator &Inv) { + // Provider is stateless and constructed only once. Do not get + // invalidated. + return false; + } + }; + + Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MAM); + +private: + void + initializeProvider(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode Mode, + LLVMContext &Ctx); + + std::unique_ptr Provider; +}; + /// Specialization for the API used by the analysis infrastructure to create /// an instance of the eviction advisor. -template <> Pass *callDefaultCtor(); +template <> Pass *callDefaultCtor(); + +RegAllocEvictionAdvisorAnalysisLegacy *createReleaseModeAdvisorAnalysisLegacy(); + +RegAllocEvictionAdvisorAnalysisLegacy * +createDevelopmentModeAdvisorAnalysisLegacy(); -RegAllocEvictionAdvisorAnalysis *createReleaseModeAdvisor(); +LLVM_ATTRIBUTE_RETURNS_NONNULL RegAllocEvictionAdvisorProvider * +createReleaseModeAdvisorProvider(LLVMContext &Ctx); -RegAllocEvictionAdvisorAnalysis *createDevelopmentModeAdvisor(); +RegAllocEvictionAdvisorProvider * +createDevelopmentModeAdvisorProvider(LLVMContext &Ctx); // TODO: move to RegAllocEvictionAdvisor.cpp when we move implementation // out of RegAllocGreedy.cpp diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 461c0c1ead16d..cf8e4a3d2513b 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1607,11 +1607,6 @@ class SelectionDAG { /// the target's desired shift amount type. SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op); - /// Create the DAG equivalent of vector_partial_reduce where Op1 and Op2 are - /// its operands and ReducedTY is the intrinsic's return type. - SDValue getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1, - SDValue Op2); - /// Expands a node with multiple results to an FP or vector libcall. 
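The advisor changes above pull the stateful piece out into a RegAllocEvictionAdvisorProvider that both the legacy ImmutablePass and the new-pass-manager analysis hand out. A minimal, LLVM-free sketch of that ownership pattern; every name below is an illustrative stand-in, not the actual LLVM class:

#include <cassert>
#include <memory>

// The provider owns the long-lived state (for example, an ML model runner)
// and hands out per-function advisors on demand.
struct Provider {
  int advisorsCreated = 0;
  int makeAdvisor() { return ++advisorsCreated; }
};

// Legacy pass manager wrapper: constructs the provider once and exposes it.
struct LegacyAnalysis {
  std::unique_ptr<Provider> Prov = std::make_unique<Provider>();
  Provider &getProvider() { return *Prov; }
};

// New pass manager analysis: its Result only points at the provider and
// reports itself as never invalidated, so the provider is built a single time.
struct NewPMAnalysis {
  struct Result {
    Provider *Prov;
    bool invalidate() const { return false; }
  };
  std::unique_ptr<Provider> Prov;
  Result run() {
    if (!Prov)
      Prov = std::make_unique<Provider>();
    return Result{Prov.get()};
  }
};

int main() {
  NewPMAnalysis NPM;
  auto R1 = NPM.run();
  auto R2 = NPM.run();
  assert(R1.Prov == R2.Prov); // the same provider is reused across runs
  LegacyAnalysis Legacy;
  Legacy.getProvider().makeAdvisor();
  return 0;
}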
The /// libcall is expected to take all the operands of the \p Node followed by /// output pointers for each of the results. \p CallRetResNo can be optionally diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index bbecc7a6ddaee..a4c3d042fe3a4 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5564,6 +5564,10 @@ class TargetLowering : public TargetLoweringBase { /// temporarily, advance store position, before re-loading the final vector. SDValue expandVECTOR_COMPRESS(SDNode *Node, SelectionDAG &DAG) const; + /// Expands PARTIAL_REDUCE_S/UMLA nodes to a series of simpler operations, + /// consisting of zext/sext, extract_subvector, mul and add operations. + SDValue expandPartialReduceMLA(SDNode *Node, SelectionDAG &DAG) const; + /// Legalize a SETCC or VP_SETCC with given LHS and RHS and condition code CC /// on the current target. A VP_SETCC will additionally be given a Mask /// and/or EVL not equal to SDValue(). diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 114149ff53d85..66fd3fb9b0526 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -732,6 +732,22 @@ class TargetRegisterInfo : public MCRegisterInfo { return composeSubRegIndicesImpl(a, b); } + /// Return a subregister index that will compose to give you the subregister + /// index. + /// + /// Finds a subregister index x such that composeSubRegIndices(a, x) == + /// b. Note that this relationship does not hold if + /// reverseComposeSubRegIndices returns the null subregister. + /// + /// The special null sub-register index composes as the identity. + unsigned reverseComposeSubRegIndices(unsigned a, unsigned b) const { + if (!a) + return b; + if (!b) + return a; + return reverseComposeSubRegIndicesImpl(a, b); + } + /// Transforms a LaneMask computed for one subregister to the lanemask that /// would have been computed when composing the subsubregisters with IdxA /// first. @sa composeSubRegIndices() @@ -774,6 +790,11 @@ class TargetRegisterInfo : public MCRegisterInfo { llvm_unreachable("Target has no sub-registers"); } + /// Overridden by TableGen in targets that have sub-registers. + virtual unsigned reverseComposeSubRegIndicesImpl(unsigned, unsigned) const { + llvm_unreachable("Target has no sub-registers"); + } + /// Overridden by TableGen in targets that have sub-registers. 
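The contract documented for reverseComposeSubRegIndices above amounts to a search: find the index x with compose(a, x) == b, with the null index composing as the identity. A standalone sketch of that contract over an arbitrary compose function; this is illustrative only and not how the TableGen-generated implementation works:

#include <cassert>
#include <functional>

// Null sub-register index 0 composes as the identity; otherwise search for
// the x that completes the composition, returning 0 if none exists.
unsigned reverseCompose(unsigned a, unsigned b,
                        const std::function<unsigned(unsigned, unsigned)> &compose,
                        unsigned numIndices) {
  if (!a)
    return b;
  if (!b)
    return a;
  for (unsigned x = 1; x < numIndices; ++x)
    if (compose(a, x) == b)
      return x;
  return 0;
}

int main() {
  // Toy compose table over indices {0,1,2,3}: composing 1 with 2 yields 3.
  auto compose = [](unsigned a, unsigned x) -> unsigned {
    if (!a) return x;
    if (!x) return a;
    return (a == 1 && x == 2) ? 3u : 0u;
  };
  assert(reverseCompose(1, 3, compose, 4) == 2); // compose(1, 2) == 3
  assert(reverseCompose(0, 3, compose, 4) == 3); // null index is identity
  return 0;
}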
virtual LaneBitmask composeSubRegIndexLaneMaskImpl(unsigned, LaneBitmask) const { diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index da4ffcd83213a..81a602c8889d8 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -251,7 +251,7 @@ void initializePseudoProbeInserterPass(PassRegistry &); void initializeRAGreedyPass(PassRegistry &); void initializeReachingDefAnalysisPass(PassRegistry &); void initializeReassociateLegacyPassPass(PassRegistry &); -void initializeRegAllocEvictionAdvisorAnalysisPass(PassRegistry &); +void initializeRegAllocEvictionAdvisorAnalysisLegacyPass(PassRegistry &); void initializeRegAllocFastPass(PassRegistry &); void initializeRegAllocPriorityAdvisorAnalysisPass(PassRegistry &); void initializeRegAllocScoringPass(PassRegistry &); diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 1458318ff021a..12781e2b84623 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -57,6 +57,7 @@ #include "llvm/CodeGen/PeepholeOptimizer.h" #include "llvm/CodeGen/PostRASchedulerList.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" +#include "llvm/CodeGen/RegAllocEvictionAdvisor.h" #include "llvm/CodeGen/RegAllocFast.h" #include "llvm/CodeGen/RegUsageInfoCollector.h" #include "llvm/CodeGen/RegUsageInfoPropagate.h" diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 075ebcb829553..2b5e258682585 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -114,6 +114,7 @@ MACHINE_FUNCTION_ANALYSIS("machine-post-dom-tree", MachinePostDominatorTreeAnalysis()) MACHINE_FUNCTION_ANALYSIS("machine-trace-metrics", MachineTraceMetricsAnalysis()) MACHINE_FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC)) +MACHINE_FUNCTION_ANALYSIS("regalloc-evict", RegAllocEvictionAdvisorAnalysis()) MACHINE_FUNCTION_ANALYSIS("slot-indexes", SlotIndexesAnalysis()) MACHINE_FUNCTION_ANALYSIS("spill-code-placement", SpillPlacementAnalysis()) MACHINE_FUNCTION_ANALYSIS("virtregmap", VirtRegMapAnalysis()) diff --git a/llvm/include/llvm/Target/TargetSchedule.td b/llvm/include/llvm/Target/TargetSchedule.td index 2562ed0901303..f55bff16dcecd 100644 --- a/llvm/include/llvm/Target/TargetSchedule.td +++ b/llvm/include/llvm/Target/TargetSchedule.td @@ -321,9 +321,13 @@ class SchedWriteRes resources> : SchedWrite, // Define values common to ReadAdvance and SchedReadAdvance. // // SchedModel ties these resources to a processor. -class ProcReadAdvance writes = []> { +class ProcReadAdvance writes = [], + list tunables = []> { + assert !le(!size(tunables), !size(writes)), + "cannot have more `tunables' than `writes'"; int Cycles = cycles; list ValidWrites = writes; + list CycleTunables = tunables; // Allow a processor to mark some scheduling classes as unsupported // for stronger verification. bit Unsupported = false; @@ -340,15 +344,17 @@ class ProcReadAdvance writes = []> { // indicate operands that are always read this number of Cycles later // than a normal register read, allowing the read's parent instruction // to issue earlier relative to the writer. 
-class ReadAdvance writes = []> - : ProcReadAdvance { +class ReadAdvance writes = [], + list tunables = []> + : ProcReadAdvance { SchedRead ReadType = read; } // Directly associate a new SchedRead type with a delay and optional // pipeline bypass. For use with InstRW or ItinRW. -class SchedReadAdvance writes = []> : SchedRead, - ProcReadAdvance; +class SchedReadAdvance writes = [], + list tunables = []> + : SchedRead, ProcReadAdvance; // Define SchedRead defaults. Reads seldom need special treatment. def ReadDefault : SchedRead; diff --git a/llvm/include/llvm/Telemetry/Telemetry.h b/llvm/include/llvm/Telemetry/Telemetry.h index 344a49df5cbf0..42319f3ef51f2 100644 --- a/llvm/include/llvm/Telemetry/Telemetry.h +++ b/llvm/include/llvm/Telemetry/Telemetry.h @@ -138,10 +138,6 @@ class Manager { public: virtual ~Manager() = default; - // Optional callback for subclasses to perform additional tasks before - // dispatching to Destinations. - virtual Error preDispatch(TelemetryInfo *Entry) = 0; - // Dispatch Telemetry data to the Destination(s). // The argument is non-const because the Manager may add or remove // data from the entry. @@ -150,6 +146,11 @@ class Manager { // Register a Destination. void addDestination(std::unique_ptr Destination); +protected: + // Optional callback for subclasses to perform additional tasks before + // dispatching to Destinations. + virtual Error preDispatch(TelemetryInfo *Entry); + private: std::vector> Destinations; }; diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 7d6dbd51a404d..5a22ac8abc3fc 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -42,13 +42,12 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PassManager.h" -#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" @@ -66,7 +65,6 @@ #include using namespace llvm; -using namespace llvm::PatternMatch; #define DEBUG_TYPE "loop-accesses" @@ -793,7 +791,8 @@ class AccessAnalysis { } // end anonymous namespace -/// Try to compute the stride for \p AR. Used by getPtrStride. +/// Try to compute a constant stride for \p AR. Used by getPtrStride and +/// isNoWrap. static std::optional getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, Value *Ptr, PredicatedScalarEvolution &PSE) { @@ -835,16 +834,24 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, return Stride; } -static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR, - PredicatedScalarEvolution &PSE, const Loop *L); +static bool isNoWrapGEP(Value *Ptr, PredicatedScalarEvolution &PSE, + const Loop *L); -/// Check whether a pointer address cannot wrap. +/// Check whether \p AR is a non-wrapping AddRec, or if \p Ptr is a non-wrapping +/// GEP. static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, Value *Ptr, Type *AccessTy, const Loop *L, bool Assume, std::optional Stride = std::nullopt) { + // FIXME: This should probably only return true for NUW. 
+ if (AR->getNoWrapFlags(SCEV::NoWrapMask)) + return true; + + if (PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW)) + return true; + // The address calculation must not wrap. Otherwise, a dependence could be // inverted. - if (isNoWrapAddRec(Ptr, AR, PSE, L)) + if (isNoWrapGEP(Ptr, PSE, L)) return true; // An nusw getelementptr that is an AddRec cannot wrap. If it would wrap, @@ -877,7 +884,7 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, return true; } - return PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW); + return false; } static void visitPointers(Value *StartPtr, const Loop &InnermostLoop, @@ -1445,18 +1452,9 @@ void AccessAnalysis::processMemAccesses() { } } -/// Return true if an AddRec pointer \p Ptr is unsigned non-wrapping, -/// i.e. monotonically increasing/decreasing. -static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR, - PredicatedScalarEvolution &PSE, const Loop *L) { - - // FIXME: This should probably only return true for NUW. - if (AR->getNoWrapFlags(SCEV::NoWrapMask)) - return true; - - if (PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW)) - return true; - +/// Check whether \p Ptr is non-wrapping GEP. +static bool isNoWrapGEP(Value *Ptr, PredicatedScalarEvolution &PSE, + const Loop *L) { // Scalar evolution does not propagate the non-wrapping flags to values that // are derived from a non-wrapping induction variable because non-wrapping // could be flow-sensitive. @@ -2815,50 +2813,25 @@ bool LoopAccessInfo::isInvariant(Value *V) const { return SE->isLoopInvariant(S, TheLoop); } -/// Find the operand of the GEP that should be checked for consecutive -/// stores. This ignores trailing indices that have no effect on the final -/// pointer. -static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) { - const DataLayout &DL = Gep->getDataLayout(); - unsigned LastOperand = Gep->getNumOperands() - 1; - TypeSize GEPAllocSize = DL.getTypeAllocSize(Gep->getResultElementType()); - - // Walk backwards and try to peel off zeros. - while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) { - // Find the type we're currently indexing into. - gep_type_iterator GEPTI = gep_type_begin(Gep); - std::advance(GEPTI, LastOperand - 2); - - // If it's a type with the same allocation size as the result of the GEP we - // can peel off the zero index. - TypeSize ElemSize = GEPTI.isStruct() - ? DL.getTypeAllocSize(GEPTI.getIndexedType()) - : GEPTI.getSequentialElementStride(DL); - if (ElemSize != GEPAllocSize) - break; - --LastOperand; - } - - return LastOperand; -} - -/// If the argument is a GEP, then returns the operand identified by -/// getGEPInductionOperand. However, if there is some other non-loop-invariant -/// operand, it returns that instead. -static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { +/// If \p Ptr is a GEP, which has a loop-variant operand, return that operand. +/// Otherwise, return \p Ptr. +static Value *getLoopVariantGEPOperand(Value *Ptr, ScalarEvolution *SE, + Loop *Lp) { auto *GEP = dyn_cast(Ptr); if (!GEP) return Ptr; - unsigned InductionOperand = getGEPInductionOperand(GEP); - - // Check that all of the gep indices are uniform except for our induction - // operand. 
- for (unsigned I = 0, E = GEP->getNumOperands(); I != E; ++I) - if (I != InductionOperand && - !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(I)), Lp)) - return Ptr; - return GEP->getOperand(InductionOperand); + Value *V = Ptr; + for (const Use &U : GEP->operands()) { + if (!SE->isLoopInvariant(SE->getSCEV(U), Lp)) { + if (V == Ptr) + V = U; + else + // There must be exactly one loop-variant operand. + return Ptr; + } + } + return V; } /// Get the stride of a pointer access in a loop. Looks for symbolic @@ -2873,7 +2846,7 @@ static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *L // pointer, otherwise, we are analyzing the index. Value *OrigPtr = Ptr; - Ptr = stripGetElementPtr(Ptr, SE, Lp); + Ptr = getLoopVariantGEPOperand(Ptr, SE, Lp); const SCEV *V = SE->getSCEV(Ptr); if (Ptr != OrigPtr) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index e4454c42c7857..91a5f194db9dc 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -8193,14 +8193,6 @@ static bool handleGuaranteedNonPoisonOps(const Instruction *I, } } -void llvm::getGuaranteedNonPoisonOps(const Instruction *I, - SmallVectorImpl &Operands) { - handleGuaranteedNonPoisonOps(I, [&](const Value *V) { - Operands.push_back(V); - return false; - }); -} - bool llvm::mustTriggerUB(const Instruction *I, const SmallPtrSetImpl &KnownPoison) { return handleGuaranteedNonPoisonOps( diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp index 9656774c6eaae..1a8e11de909e8 100644 --- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp +++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp @@ -11,11 +11,11 @@ //===----------------------------------------------------------------------===// #include "AllocationOrder.h" -#include "RegAllocEvictionAdvisor.h" #include "RegAllocGreedy.h" #include "llvm/Analysis/InteractiveModelRunner.h" #include "llvm/Analysis/MLModelRunner.h" #include "llvm/Analysis/TensorSpec.h" +#include "llvm/CodeGen/RegAllocEvictionAdvisor.h" #if defined(LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL) || defined(LLVM_HAVE_TFLITE) #include "llvm/Analysis/ModelUnderTrainingRunner.h" #include "llvm/Analysis/NoInferenceModelRunner.h" @@ -115,7 +115,7 @@ class RegAllocScoring : public MachineFunctionPass { /// RegAllocReward analysis usage. void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); @@ -389,11 +389,12 @@ class MLEvictAdvisor : public RegAllocEvictionAdvisor { // =================================== // Release (AOT) - specifics // =================================== -class ReleaseModeEvictionAdvisorAnalysis final - : public RegAllocEvictionAdvisorAnalysis { +/// Common provider for legacy and new pass managers. +class ReleaseModeEvictionAdvisorProvider final + : public RegAllocEvictionAdvisorProvider { public: - ReleaseModeEvictionAdvisorAnalysis() - : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Release) { + ReleaseModeEvictionAdvisorProvider(LLVMContext &Ctx) + : RegAllocEvictionAdvisorProvider(AdvisorMode::Release, Ctx) { if (EnableDevelopmentFeatures) { InputFeatures = {RA_EVICT_FEATURES_LIST( _DECL_FEATURES) RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_DECL_FEATURES) @@ -403,21 +404,13 @@ class ReleaseModeEvictionAdvisorAnalysis final } } // support for isa<> and dyn_cast. 
- static bool classof(const RegAllocEvictionAdvisorAnalysis *R) { + static bool classof(const RegAllocEvictionAdvisorProvider *R) { return R->getAdvisorMode() == AdvisorMode::Release; } -private: - std::vector InputFeatures; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - RegAllocEvictionAdvisorAnalysis::getAnalysisUsage(AU); - } - std::unique_ptr - getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override { + getAdvisor(const MachineFunction &MF, const RAGreedy &RA, + MachineBlockFrequencyInfo *MBFI, MachineLoopInfo *Loops) override { if (!Runner) { if (InteractiveChannelBaseName.empty()) Runner = std::make_unique>( @@ -428,14 +421,45 @@ class ReleaseModeEvictionAdvisorAnalysis final InteractiveChannelBaseName + ".out", InteractiveChannelBaseName + ".in"); } - return std::make_unique( - MF, RA, Runner.get(), - getAnalysis().getMBFI(), - getAnalysis().getLI()); + assert(MBFI && Loops && + "Invalid provider state: must have analysis available"); + return std::make_unique(MF, RA, Runner.get(), *MBFI, + *Loops); } + +private: + std::vector InputFeatures; std::unique_ptr Runner; }; +class ReleaseModeEvictionAdvisorAnalysisLegacy final + : public RegAllocEvictionAdvisorAnalysisLegacy { +public: + ReleaseModeEvictionAdvisorAnalysisLegacy() + : RegAllocEvictionAdvisorAnalysisLegacy(AdvisorMode::Release) {} + + void logRewardIfNeeded(const MachineFunction &MF, + llvm::function_ref GetReward) override { + // No-op in release mode + } + + bool doInitialization(Module &M) override { + Provider = + std::make_unique(M.getContext()); + return false; + } + + static bool classof(const RegAllocEvictionAdvisorAnalysisLegacy *R) { + return R->getAdvisorMode() == AdvisorMode::Release; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + RegAllocEvictionAdvisorAnalysisLegacy::getAnalysisUsage(AU); + } +}; + // =================================== // Development mode-specifics // =================================== @@ -468,11 +492,11 @@ class DevelopmentModeEvictAdvisor : public MLEvictAdvisor { Logger *const Log; }; -class DevelopmentModeEvictionAdvisorAnalysis final - : public RegAllocEvictionAdvisorAnalysis { +class DevelopmentModeEvictionAdvisorProvider final + : public RegAllocEvictionAdvisorProvider { public: - DevelopmentModeEvictionAdvisorAnalysis() - : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Development) { + DevelopmentModeEvictionAdvisorProvider(LLVMContext &Ctx) + : RegAllocEvictionAdvisorProvider(AdvisorMode::Development, Ctx) { if (EnableDevelopmentFeatures) { InputFeatures = {RA_EVICT_FEATURES_LIST( _DECL_FEATURES) RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_DECL_FEATURES) @@ -492,44 +516,10 @@ class DevelopmentModeEvictionAdvisorAnalysis final TensorSpec::createSpec("action_step_type", {1}), TensorSpec::createSpec("action_reward", {1})}; } - } - // support for isa<> and dyn_cast. - static bool classof(const RegAllocEvictionAdvisorAnalysis *R) { - return R->getAdvisorMode() == AdvisorMode::Development; - } - - void logRewardIfNeeded(const MachineFunction &MF, - llvm::function_ref GetReward) override { - if (!Log || !Log->hasAnyObservationForContext(MF.getName())) - return; - // The function pass manager would run all the function passes for a - // function, so we assume the last context belongs to this function. If - // this invariant ever changes, we can implement at that time switching - // contexts. 
At this point, it'd be an error - if (Log->currentContext() != MF.getName()) { - MF.getFunction().getContext().emitError( - "The training log context shouldn't have had changed."); - } - if (Log->hasObservationInProgress()) - Log->logReward(GetReward()); - } - -private: - std::vector InputFeatures; - std::vector TrainingInputFeatures; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - RegAllocEvictionAdvisorAnalysis::getAnalysisUsage(AU); - } - - bool doInitialization(Module &M) override { - LLVMContext &Ctx = M.getContext(); if (ModelUnderTraining.empty() && TrainingLog.empty()) { Ctx.emitError("Regalloc development mode should be requested with at " "least logging enabled and/or a training model"); - return false; + return; } if (ModelUnderTraining.empty()) Runner = std::make_unique(Ctx, InputFeatures); @@ -538,15 +528,15 @@ class DevelopmentModeEvictionAdvisorAnalysis final Ctx, ModelUnderTraining, DecisionName, TrainingInputFeatures); if (!Runner) { Ctx.emitError("Regalloc: could not set up the model runner"); - return false; + return; } if (TrainingLog.empty()) - return false; + return; std::error_code EC; auto OS = std::make_unique(TrainingLog, EC); if (EC) { - M.getContext().emitError(EC.message() + ":" + TrainingLog); - return false; + Ctx.emitError(EC.message() + ":" + TrainingLog); + return; } std::vector LFS = InputFeatures; if (auto *MUTR = dyn_cast(Runner.get())) @@ -558,25 +548,80 @@ class DevelopmentModeEvictionAdvisorAnalysis final Log = std::make_unique(std::move(OS), LFS, Reward, /*IncludeReward*/ true); - return false; + return; + } + + // support for isa<> and dyn_cast. + static bool classof(const RegAllocEvictionAdvisorProvider *R) { + return R->getAdvisorMode() == AdvisorMode::Development; + } + + void logRewardIfNeeded(const MachineFunction &MF, + llvm::function_ref GetReward) override { + if (!Log || !Log->hasAnyObservationForContext(MF.getName())) + return; + // The function pass manager would run all the function passes for a + // function, so we assume the last context belongs to this function. If + // this invariant ever changes, we can implement at that time switching + // contexts. 
At this point, it'd be an error + if (Log->currentContext() != MF.getName()) { + MF.getFunction().getContext().emitError( + "The training log context shouldn't have had changed."); + } + if (Log->hasObservationInProgress()) + Log->logReward(GetReward()); } std::unique_ptr - getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override { + getAdvisor(const MachineFunction &MF, const RAGreedy &RA, + MachineBlockFrequencyInfo *MBFI, MachineLoopInfo *Loops) override { if (!Runner) return nullptr; if (Log) Log->switchContext(MF.getName()); + assert(MBFI && Loops && + "Invalid provider state: must have analysis available"); return std::make_unique( - MF, RA, Runner.get(), - getAnalysis().getMBFI(), - getAnalysis().getLI(), Log.get()); + MF, RA, Runner.get(), *MBFI, *Loops, Log.get()); } +private: + std::vector InputFeatures; + std::vector TrainingInputFeatures; + std::unique_ptr Runner; std::unique_ptr Log; }; +class DevelopmentModeEvictionAdvisorAnalysisLegacy final + : public RegAllocEvictionAdvisorAnalysisLegacy { +public: + DevelopmentModeEvictionAdvisorAnalysisLegacy() + : RegAllocEvictionAdvisorAnalysisLegacy(AdvisorMode::Development) {} + + bool doInitialization(Module &M) override { + Provider = std::make_unique( + M.getContext()); + return false; + } + + void logRewardIfNeeded(const MachineFunction &MF, + llvm::function_ref GetReward) override { + Provider->logRewardIfNeeded(MF, GetReward); + } + + // support for isa<> and dyn_cast. + static bool classof(const RegAllocEvictionAdvisorAnalysisLegacy *R) { + return R->getAdvisorMode() == AdvisorMode::Development; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + RegAllocEvictionAdvisorAnalysisLegacy::getAnalysisUsage(AU); + } +}; + #endif // #ifdef LLVM_HAVE_TFLITE } // namespace @@ -1127,8 +1172,9 @@ void llvm::extractMBBFrequency( // Development mode-specific implementations #ifdef LLVM_HAVE_TFLITE -RegAllocEvictionAdvisorAnalysis *llvm::createDevelopmentModeAdvisor() { - return new DevelopmentModeEvictionAdvisorAnalysis(); +RegAllocEvictionAdvisorAnalysisLegacy * +llvm::createDevelopmentModeAdvisorAnalysisLegacy() { + return new DevelopmentModeEvictionAdvisorAnalysisLegacy(); } int64_t DevelopmentModeEvictAdvisor::tryFindEvictionCandidatePosition( @@ -1194,18 +1240,32 @@ bool RegAllocScoring::runOnMachineFunction(MachineFunction &MF) { return *CachedReward; }; - getAnalysis().logRewardIfNeeded(MF, - GetReward); + getAnalysis().logRewardIfNeeded( + MF, GetReward); getAnalysis().logRewardIfNeeded(MF, GetReward); return false; } #endif // #ifdef LLVM_HAVE_TFLITE -RegAllocEvictionAdvisorAnalysis *llvm::createReleaseModeAdvisor() { +RegAllocEvictionAdvisorProvider * +llvm::createReleaseModeAdvisorProvider(LLVMContext &Ctx) { + return new ReleaseModeEvictionAdvisorProvider(Ctx); +} + +RegAllocEvictionAdvisorProvider * +llvm::createDevelopmentModeAdvisorProvider(LLVMContext &Ctx) { +#if defined(LLVM_HAVE_TFLITE) + return new DevelopmentModeEvictionAdvisorProvider(Ctx); +#endif + return nullptr; +} + +RegAllocEvictionAdvisorAnalysisLegacy * +llvm::createReleaseModeAdvisorAnalysisLegacy() { return llvm::isEmbeddedModelEvaluatorValid() || !InteractiveChannelBaseName.empty() - ? new ReleaseModeEvictionAdvisorAnalysis() + ? 
new ReleaseModeEvictionAdvisorAnalysisLegacy() : nullptr; } diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 745c0d4b36a62..24bd9938bc45c 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -1984,12 +1984,43 @@ ValueTrackerResult ValueTracker::getNextSourceFromRegSequence() { // We are looking at: // Def = REG_SEQUENCE v0, sub0, v1, sub1, ... - // Check if one of the operand defines the subreg we are interested in. + // + // Check if one of the operands exactly defines the subreg we are interested + // in. for (const RegSubRegPairAndIdx &RegSeqInput : RegSeqInputRegs) { if (RegSeqInput.SubIdx == DefSubReg) return ValueTrackerResult(RegSeqInput.Reg, RegSeqInput.SubReg); } + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + + // If we did not find an exact match, see if we can do a composition to + // extract a sub-subregister. + for (const RegSubRegPairAndIdx &RegSeqInput : RegSeqInputRegs) { + // We don't check if the resulting class supports the subregister index + // yet. This will occur before any rewrite when looking for an eligible + // source. + + LaneBitmask DefMask = TRI->getSubRegIndexLaneMask(DefSubReg); + LaneBitmask ThisOpRegMask = TRI->getSubRegIndexLaneMask(RegSeqInput.SubIdx); + + // Check that this extract reads a subset of this single reg_sequence input. + // + // FIXME: We should be able to filter this in terms of the indexes directly + // without checking the lanemasks. + if ((DefMask & ThisOpRegMask) != DefMask) + continue; + + unsigned ReverseDefCompose = + TRI->reverseComposeSubRegIndices(RegSeqInput.SubIdx, DefSubReg); + if (!ReverseDefCompose) + continue; + + unsigned ComposedDefInSrcReg1 = + TRI->composeSubRegIndices(RegSeqInput.SubReg, ReverseDefCompose); + return ValueTrackerResult(RegSeqInput.Reg, ComposedDefInSrcReg1); + } + // If the subreg we are tracking is super-defined by another subreg, // we could follow this value. However, this would require to compose // the subreg and we do not do that for now. diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp index a1f441ebd0d5e..2369615ef0fb6 100644 --- a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp +++ b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp @@ -9,12 +9,14 @@ // Implementation of the default eviction advisor and of the Analysis pass. 
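The hunks above and below restructure the eviction advisor machinery: advisor construction moves out of the legacy ImmutablePass into a RegAllocEvictionAdvisorProvider that is handed MachineBlockFrequencyInfo and MachineLoopInfo explicitly, while the legacy analyses become thin wrappers that build and expose a provider. A minimal standalone sketch of that shape, using hypothetical stand-in types rather than the real LLVM classes:

// Standalone illustration of the provider split (hypothetical names, not the
// LLVM classes): advisor construction takes its analyses as arguments instead
// of calling getAnalysis<...>(), so one provider can back both the legacy
// ImmutablePass and the new-PM analysis.
#include <iostream>
#include <memory>

struct BlockFrequencyInfo {}; // stand-in for MachineBlockFrequencyInfo
struct LoopInfo {};           // stand-in for MachineLoopInfo

struct Advisor {
  virtual ~Advisor() = default;
  virtual const char *name() const = 0;
};

struct DefaultAdvisor final : Advisor {
  const char *name() const override { return "default"; }
};

struct AdvisorProvider {
  virtual ~AdvisorProvider() = default;
  virtual std::unique_ptr<Advisor> getAdvisor(BlockFrequencyInfo *MBFI,
                                              LoopInfo *Loops) = 0;
};

struct DefaultProvider final : AdvisorProvider {
  std::unique_ptr<Advisor> getAdvisor(BlockFrequencyInfo *,
                                      LoopInfo *) override {
    return std::make_unique<DefaultAdvisor>();
  }
};

// The pass-manager-facing wrapper only owns a provider and hands it out; the
// register allocator then calls getAdvisor() with analyses it already has.
struct AdvisorAnalysisWrapper {
  std::unique_ptr<AdvisorProvider> Provider =
      std::make_unique<DefaultProvider>();
  AdvisorProvider &getProvider() { return *Provider; }
};

int main() {
  AdvisorAnalysisWrapper Wrapper;
  BlockFrequencyInfo MBFI;
  LoopInfo Loops;
  std::cout << Wrapper.getProvider().getAdvisor(&MBFI, &Loops)->name() << "\n";
}

Because the provider never calls getAnalysis<...>() itself, the same object can sit behind the legacy wrappers shown above and the new-PM RegAllocEvictionAdvisorAnalysis added further down.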
// //===----------------------------------------------------------------------===// - -#include "RegAllocEvictionAdvisor.h" +#include "llvm/CodeGen/RegAllocEvictionAdvisor.h" #include "AllocationOrder.h" #include "RegAllocGreedy.h" +#include "RegAllocPriorityAdvisor.h" #include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/IR/Module.h" @@ -26,17 +28,18 @@ using namespace llvm; -static cl::opt Mode( +static cl::opt Mode( "regalloc-enable-advisor", cl::Hidden, - cl::init(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Default), + cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values( - clEnumValN(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Default, + clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), - clEnumValN(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Release, + clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), - clEnumValN(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Development, - "development", "for training"))); + clEnumValN( + RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, + "development", "for training"))); static cl::opt EnableLocalReassignment( "enable-local-reassign", cl::Hidden, @@ -59,59 +62,112 @@ cl::opt EvictInterferenceCutoff( #define LLVM_HAVE_TF_AOT #endif -char RegAllocEvictionAdvisorAnalysis::ID = 0; -INITIALIZE_PASS(RegAllocEvictionAdvisorAnalysis, "regalloc-evict", +char RegAllocEvictionAdvisorAnalysisLegacy::ID = 0; +INITIALIZE_PASS(RegAllocEvictionAdvisorAnalysisLegacy, "regalloc-evict", "Regalloc eviction policy", false, true) namespace { -class DefaultEvictionAdvisorAnalysis final - : public RegAllocEvictionAdvisorAnalysis { +class DefaultEvictionAdvisorProvider final + : public RegAllocEvictionAdvisorProvider { public: - DefaultEvictionAdvisorAnalysis(bool NotAsRequested) - : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Default), - NotAsRequested(NotAsRequested) {} + DefaultEvictionAdvisorProvider(bool NotAsRequested, LLVMContext &Ctx) + : RegAllocEvictionAdvisorProvider(AdvisorMode::Default, Ctx) { + if (NotAsRequested) + Ctx.emitError("Requested regalloc eviction advisor analysis " + "could not be created. Using default"); + } // support for isa<> and dyn_cast. - static bool classof(const RegAllocEvictionAdvisorAnalysis *R) { + static bool classof(const RegAllocEvictionAdvisorProvider *R) { return R->getAdvisorMode() == AdvisorMode::Default; } -private: std::unique_ptr - getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override { + getAdvisor(const MachineFunction &MF, const RAGreedy &RA, + MachineBlockFrequencyInfo *, MachineLoopInfo *) override { return std::make_unique(MF, RA); } +}; + +class DefaultEvictionAdvisorAnalysisLegacy final + : public RegAllocEvictionAdvisorAnalysisLegacy { +public: + DefaultEvictionAdvisorAnalysisLegacy(bool NotAsRequested) + : RegAllocEvictionAdvisorAnalysisLegacy(AdvisorMode::Default), + NotAsRequested(NotAsRequested) {} + bool doInitialization(Module &M) override { - if (NotAsRequested) - M.getContext().emitError("Requested regalloc eviction advisor analysis " - "could not be created. 
Using default"); - return RegAllocEvictionAdvisorAnalysis::doInitialization(M); + Provider.reset( + new DefaultEvictionAdvisorProvider(NotAsRequested, M.getContext())); + return false; + } + + // support for isa<> and dyn_cast. + static bool classof(const RegAllocEvictionAdvisorAnalysisLegacy *R) { + return R->getAdvisorMode() == AdvisorMode::Default; } + +private: const bool NotAsRequested; }; } // namespace -template <> Pass *llvm::callDefaultCtor() { - Pass *Ret = nullptr; +AnalysisKey RegAllocEvictionAdvisorAnalysis::Key; + +void RegAllocEvictionAdvisorAnalysis::initializeProvider( + RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode Mode, LLVMContext &Ctx) { + if (Provider) + return; + switch (Mode) { + case RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default: + Provider.reset( + new DefaultEvictionAdvisorProvider(/*NotAsRequested=*/false, Ctx)); + return; + case RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development: +#if defined(LLVM_HAVE_TFLITE) + Provider.reset(createDevelopmentModeAdvisorProvider(Ctx)); +#else + Provider.reset( + new DefaultEvictionAdvisorProvider(/*NotAsRequested=*/true, Ctx)); +#endif + return; + case RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release: + Provider.reset(createReleaseModeAdvisorProvider(Ctx)); + return; + } +} + +RegAllocEvictionAdvisorAnalysis::Result +RegAllocEvictionAdvisorAnalysis::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + // Lazy initialization of the provider. + initializeProvider(::Mode, MF.getFunction().getContext()); + return Result{Provider.get()}; +} + +template <> +Pass *llvm::callDefaultCtor() { switch (Mode) { - case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Default: - Ret = new DefaultEvictionAdvisorAnalysis(/*NotAsRequested*/ false); - break; - case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Development: + case RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default: + return new DefaultEvictionAdvisorAnalysisLegacy(/*NotAsRequested=*/false); + case RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release: { + Pass *Ret = createReleaseModeAdvisorAnalysisLegacy(); + // release mode advisor may not be supported + if (Ret) + return Ret; + return new DefaultEvictionAdvisorAnalysisLegacy(/*NotAsRequested=*/true); + } + case RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development: #if defined(LLVM_HAVE_TFLITE) - Ret = createDevelopmentModeAdvisor(); + return createDevelopmentModeAdvisorAnalysisLegacy(); +#else + return new DefaultEvictionAdvisorAnalysisLegacy(/*NotAsRequested=*/true); #endif - break; - case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Release: - Ret = createReleaseModeAdvisor(); - break; } - if (Ret) - return Ret; - return new DefaultEvictionAdvisorAnalysis(/*NotAsRequested*/ true); + llvm_unreachable("unexpected advisor mode"); } -StringRef RegAllocEvictionAdvisorAnalysis::getPassName() const { +StringRef RegAllocEvictionAdvisorAnalysisLegacy::getPassName() const { switch (getAdvisorMode()) { case AdvisorMode::Default: return "Default Regalloc Eviction Advisor"; diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 7c7eb2ad52b41..9318c1df0b5e2 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -15,7 +15,6 @@ #include "AllocationOrder.h" #include "InterferenceCache.h" #include "RegAllocBase.h" -#include "RegAllocEvictionAdvisor.h" #include "RegAllocPriorityAdvisor.h" #include "SplitKit.h" #include "llvm/ADT/ArrayRef.h" @@ -46,6 +45,7 @@ #include 
"llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegAllocEvictionAdvisor.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/SlotIndexes.h" @@ -164,7 +164,7 @@ INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(EdgeBundlesWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(SpillPlacementWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass) -INITIALIZE_PASS_DEPENDENCY(RegAllocEvictionAdvisorAnalysis) +INITIALIZE_PASS_DEPENDENCY(RegAllocEvictionAdvisorAnalysisLegacy) INITIALIZE_PASS_DEPENDENCY(RegAllocPriorityAdvisorAnalysis) INITIALIZE_PASS_END(RAGreedy, "greedy", "Greedy Register Allocator", false, false) @@ -219,7 +219,7 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -2765,8 +2765,11 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { : TRI->reverseLocalAssignment(); ExtraInfo.emplace(); - EvictAdvisor = - getAnalysis().getAdvisor(*MF, *this); + + auto &EvictAdvisorProvider = + getAnalysis().getProvider(); + EvictAdvisor = EvictAdvisorProvider.getAdvisor(*MF, *this, MBFI, Loops); + PriorityAdvisor = getAnalysis().getAdvisor(*MF, *this); diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h index e1ec63b4a5296..1d55a8241d760 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.h +++ b/llvm/lib/CodeGen/RegAllocGreedy.h @@ -14,7 +14,6 @@ #include "InterferenceCache.h" #include "RegAllocBase.h" -#include "RegAllocEvictionAdvisor.h" #include "RegAllocPriorityAdvisor.h" #include "SplitKit.h" #include "llvm/ADT/ArrayRef.h" diff --git a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h b/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h index 32e4598b71539..0758743c2b140 100644 --- a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h +++ b/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h @@ -9,7 +9,7 @@ #ifndef LLVM_CODEGEN_REGALLOCPRIORITYADVISOR_H #define LLVM_CODEGEN_REGALLOCPRIORITYADVISOR_H -#include "RegAllocEvictionAdvisor.h" +#include "llvm/CodeGen/RegAllocEvictionAdvisor.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/Pass.h" diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c6fd72b6b76f4..bc7cdf38dbc2a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -28446,7 +28446,11 @@ static SDValue takeInexpensiveLog2(SelectionDAG &DAG, const SDLoc &DL, EVT VT, return SDValue(); auto CastToVT = [&](EVT NewVT, SDValue ToCast) { - ToCast = PeekThroughCastsAndTrunc(ToCast); + // Peek through zero extend. We can't peek through truncates since this + // function is called on a shift amount. We must ensure that all of the bits + // above the original shift amount are zeroed by this function. 
+ while (ToCast.getOpcode() == ISD::ZERO_EXTEND) + ToCast = ToCast.getOperand(0); EVT CurVT = ToCast.getValueType(); if (NewVT == CurVT) return ToCast; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index a0f29496df777..204b323d7084a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -159,6 +159,11 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { Res = PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(N); break; + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: + Res = PromoteIntRes_PARTIAL_REDUCE_MLA(N); + break; + case ISD::SIGN_EXTEND: case ISD::VP_SIGN_EXTEND: case ISD::ZERO_EXTEND: @@ -2099,6 +2104,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::VECTOR_FIND_LAST_ACTIVE: Res = PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(N, OpNo); break; + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: + Res = PromoteIntOp_PARTIAL_REDUCE_MLA(N); + break; } // If the result is null, the sub-method took care of registering results etc. @@ -2881,6 +2890,18 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_PARTIAL_REDUCE_MLA(SDNode *N) { + SmallVector NewOps(N->ops()); + if (N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA) { + NewOps[1] = SExtPromotedInteger(N->getOperand(1)); + NewOps[2] = SExtPromotedInteger(N->getOperand(2)); + } else { + NewOps[1] = ZExtPromotedInteger(N->getOperand(1)); + NewOps[2] = ZExtPromotedInteger(N->getOperand(2)); + } + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + //===----------------------------------------------------------------------===// // Integer Result Expansion //===----------------------------------------------------------------------===// @@ -6200,6 +6221,15 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N) { return DAG.getNode(ISD::VECTOR_FIND_LAST_ACTIVE, SDLoc(N), NVT, N->ops()); } +SDValue DAGTypeLegalizer::PromoteIntRes_PARTIAL_REDUCE_MLA(SDNode *N) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + SDValue ExtAcc = GetPromotedInteger(N->getOperand(0)); + return DAG.getNode(N->getOpcode(), DL, NVT, ExtAcc, N->getOperand(1), + N->getOperand(2)); +} + SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) { EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index b58c160b5c8b8..69c687a797485 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -379,6 +379,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntRes_IS_FPCLASS(SDNode *N); SDValue PromoteIntRes_PATCHPOINT(SDNode *N); SDValue PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N); + SDValue PromoteIntRes_PARTIAL_REDUCE_MLA(SDNode *N); // Integer Operand Promotion. 
bool PromoteIntegerOperand(SDNode *N, unsigned OpNo); @@ -430,6 +431,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_PARTIAL_REDUCE_MLA(SDNode *N); void SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -969,6 +971,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void SplitVecRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_FP_TO_XINT_SAT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo, SDValue &Hi); // Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>. bool SplitVectorOperand(SDNode *N, unsigned OpNo); @@ -1000,6 +1003,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SplitVecOp_FP_TO_XINT_SAT(SDNode *N); SDValue SplitVecOp_VP_CttzElements(SDNode *N); SDValue SplitVecOp_VECTOR_HISTOGRAM(SDNode *N); + SDValue SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N); //===--------------------------------------------------------------------===// // Vector Widening Support: LegalizeVectorTypes.cpp diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 7e8bae4b0f785..de4447fb0cf1a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -469,6 +469,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::VECTOR_COMPRESS: case ISD::SCMP: case ISD::UCMP: + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; case ISD::SMULFIX: @@ -1197,6 +1199,10 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { case ISD::VECREDUCE_FMINIMUM: Results.push_back(TLI.expandVecReduce(Node, DAG)); return; + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: + Results.push_back(TLI.expandPartialReduceMLA(Node, DAG)); + return; case ISD::VECREDUCE_SEQ_FADD: case ISD::VECREDUCE_SEQ_FMUL: Results.push_back(TLI.expandVecReduceSeq(Node, DAG)); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 1d8bf5427156e..9d42ec2fdf859 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1395,6 +1395,10 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::EXPERIMENTAL_VP_REVERSE: SplitVecRes_VP_REVERSE(N, Lo, Hi); break; + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: + SplitVecRes_PARTIAL_REDUCE_MLA(N, Lo, Hi); + break; } // If Lo/Hi is null, the sub-method took care of registering results etc. 
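In the promotion hunks above, PromoteIntOp_PARTIAL_REDUCE_MLA widens the two multiplicands with SExtPromotedInteger for PARTIAL_REDUCE_SMLA and ZExtPromotedInteger for PARTIAL_REDUCE_UMLA, while the splitting hunk that follows simply defers to the generic expansion. The extension has to match the opcode's signedness so that the widened products equal the original narrow ones; a small standalone C++ illustration of an 8-bit to 16-bit promotion (plain scalar code, not SelectionDAG):

#include <cstdint>
#include <iostream>

int main() {
  int8_t A = -3, B = 5; // PARTIAL_REDUCE_SMLA treats the inputs as signed
  int16_t SExtProd = int16_t(A) * int16_t(B);                   // -15
  int16_t ZExtProd = int16_t(uint8_t(A)) * int16_t(uint8_t(B)); // 1265
  std::cout << SExtProd << " " << ZExtProd << "\n";
}

Sign-extending reproduces the signed product (-15), while zero-extending the negative lane would silently change it (1265); the unsigned opcode needs the opposite choice for the same reason.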
@@ -3213,6 +3217,13 @@ void DAGTypeLegalizer::SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo, std::tie(Lo, Hi) = DAG.SplitVector(Load, DL); } +void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDLoc DL(N); + SDValue Expanded = TLI.expandPartialReduceMLA(N, DAG); + std::tie(Lo, Hi) = DAG.SplitVector(Expanded, DL); +} + void DAGTypeLegalizer::SplitVecRes_VECTOR_DEINTERLEAVE(SDNode *N) { unsigned Factor = N->getNumOperands(); @@ -3431,6 +3442,10 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: Res = SplitVecOp_VECTOR_HISTOGRAM(N); break; + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: + Res = SplitVecOp_PARTIAL_REDUCE_MLA(N); + break; } // If the result is null, the sub-method took care of registering results etc. @@ -4485,6 +4500,10 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECTOR_HISTOGRAM(SDNode *N) { MMO, IndexType); } +SDValue DAGTypeLegalizer::SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N) { + return TLI.expandPartialReduceMLA(N, DAG); +} + //===----------------------------------------------------------------------===// // Result Vector Widening //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 9d2f87497d6fa..80c2de1d99542 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2474,35 +2474,6 @@ SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) { return getZExtOrTrunc(Op, SDLoc(Op), ShTy); } -SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1, - SDValue Op2) { - EVT FullTy = Op2.getValueType(); - - unsigned Stride = ReducedTy.getVectorMinNumElements(); - unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride; - - // Collect all of the subvectors - std::deque Subvectors = {Op1}; - for (unsigned I = 0; I < ScaleFactor; I++) { - auto SourceIndex = getVectorIdxConstant(I * Stride, DL); - Subvectors.push_back( - getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {Op2, SourceIndex})); - } - - // Flatten the subvector tree - while (Subvectors.size() > 1) { - Subvectors.push_back( - getNode(ISD::ADD, DL, ReducedTy, {Subvectors[0], Subvectors[1]})); - Subvectors.pop_front(); - Subvectors.pop_front(); - } - - assert(Subvectors.size() == 1 && - "There should only be one subvector after tree flattening"); - - return Subvectors[0]; -} - /// Given a store node \p StoreNode, return true if it is safe to fold that node /// into \p FPNode, which expands to a library call with output pointers. 
static bool canFoldStoreIntoLibCallOutputPointers(StoreSDNode *StoreNode, @@ -7883,6 +7854,28 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, break; } + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SMLA: { + [[maybe_unused]] EVT AccVT = N1.getValueType(); + [[maybe_unused]] EVT Input1VT = N2.getValueType(); + [[maybe_unused]] EVT Input2VT = N3.getValueType(); + assert(Input1VT.isVector() && Input1VT == Input2VT && + "Expected the second and third operands of the PARTIAL_REDUCE_MLA " + "node to have the same type!"); + assert(VT.isVector() && VT == AccVT && + "Expected the first operand of the PARTIAL_REDUCE_MLA node to have " + "the same type as its result!"); + assert(Input1VT.getVectorElementCount().hasKnownScalarFactor( + AccVT.getVectorElementCount()) && + "Expected the element count of the second and third operands of the " + "PARTIAL_REDUCE_MLA node to be a positive integer multiple of the " + "element count of the first operand and the result!"); + assert(N2.getScalarValueSizeInBits() <= N1.getScalarValueSizeInBits() && + "Expected the second and third operands of the PARTIAL_REDUCE_MLA " + "node to have an element type which is the same as or smaller than " + "the element type of the first operand and result!"); + break; + } } // Memoize node if it doesn't produce a glue result. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 78a6e24e5b8d2..1c58a7f05446c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8115,15 +8115,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; } case Intrinsic::experimental_vector_partial_reduce_add: { - if (!TLI.shouldExpandPartialReductionIntrinsic(cast(&I))) { visitTargetIntrinsic(I, Intrinsic); return; } - - setValue(&I, DAG.getPartialReduceAdd(sdl, EVT::getEVT(I.getType()), - getValue(I.getOperand(0)), - getValue(I.getOperand(1)))); + SDValue Acc = getValue(I.getOperand(0)); + SDValue Input = getValue(I.getOperand(1)); + setValue(&I, + DAG.getNode(ISD::PARTIAL_REDUCE_UMLA, sdl, Acc.getValueType(), Acc, + Input, DAG.getConstant(1, sdl, Input.getValueType()))); return; } case Intrinsic::experimental_cttz_elts: { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 8de537173e52c..8457bee3f665b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -569,6 +569,11 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::VECTOR_FIND_LAST_ACTIVE: return "find_last_active"; + case ISD::PARTIAL_REDUCE_UMLA: + return "partial_reduce_umla"; + case ISD::PARTIAL_REDUCE_SMLA: + return "partial_reduce_smla"; + // Vector Predication #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \ case ISD::SDID: \ diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index adfb96041c5c0..7771958f5adc9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -34,6 +34,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" #include +#include using namespace llvm; /// NOTE: The TargetMachine owns TLOF. 
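The hunk below relocates the subvector add tree removed from SelectionDAG::getPartialReduceAdd above into TargetLowering::expandPartialReduceMLA, which first extends both multiplicands to the accumulator element type and skips the multiply when the second operand is the splat of 1 that SelectionDAGBuilder now emits for the partial_reduce_add intrinsic. A scalar model of what the expansion computes for a 4-lane i32 accumulator and 16-lane i8 inputs (ordinary C++ with a hypothetical helper name, not DAG code):

#include <array>
#include <cstdint>
#include <iostream>

std::array<int32_t, 4> partialReduceUMLA(std::array<int32_t, 4> Acc,
                                         const std::array<uint8_t, 16> &A,
                                         const std::array<uint8_t, 16> &B) {
  constexpr unsigned Stride = 4;                // accumulator lane count
  constexpr unsigned ScaleFactor = 16 / Stride; // number of extracted chunks
  for (unsigned Chunk = 0; Chunk < ScaleFactor; ++Chunk)
    for (unsigned Lane = 0; Lane < Stride; ++Lane)
      Acc[Lane] += int32_t(A[Chunk * Stride + Lane]) *
                   int32_t(B[Chunk * Stride + Lane]); // zero-extended multiply
  return Acc;
}

int main() {
  std::array<int32_t, 4> Acc{1, 1, 1, 1};
  std::array<uint8_t, 16> A{}, B{};
  A.fill(2);
  B.fill(3);
  for (int32_t V : partialReduceUMLA(Acc, A, B))
    std::cout << V << ' '; // prints "25 25 25 25"
}

Each accumulator lane i sums the products of input lanes i, i+4, i+8 and i+12, matching the stride-4 extract_subvector chunks and the add tree built in the hunk.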
@@ -11890,6 +11891,57 @@ SDValue TargetLowering::expandVECTOR_COMPRESS(SDNode *Node, return DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo); } +SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, + SelectionDAG &DAG) const { + SDLoc DL(N); + SDValue Acc = N->getOperand(0); + SDValue MulLHS = N->getOperand(1); + SDValue MulRHS = N->getOperand(2); + EVT AccVT = Acc.getValueType(); + EVT MulOpVT = MulLHS.getValueType(); + + EVT ExtMulOpVT = + EVT::getVectorVT(*DAG.getContext(), AccVT.getVectorElementType(), + MulOpVT.getVectorElementCount()); + unsigned ExtOpc = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA + ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND; + + if (ExtMulOpVT != MulOpVT) { + MulLHS = DAG.getNode(ExtOpc, DL, ExtMulOpVT, MulLHS); + MulRHS = DAG.getNode(ExtOpc, DL, ExtMulOpVT, MulRHS); + } + SDValue Input = MulLHS; + APInt ConstantOne; + if (!ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) || + !ConstantOne.isOne()) + Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); + + unsigned Stride = AccVT.getVectorMinNumElements(); + unsigned ScaleFactor = MulOpVT.getVectorMinNumElements() / Stride; + + // Collect all of the subvectors + std::deque Subvectors = {Acc}; + for (unsigned I = 0; I < ScaleFactor; I++) { + auto SourceIndex = DAG.getVectorIdxConstant(I * Stride, DL); + Subvectors.push_back( + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, AccVT, {Input, SourceIndex})); + } + + // Flatten the subvector tree + while (Subvectors.size() > 1) { + Subvectors.push_back( + DAG.getNode(ISD::ADD, DL, AccVT, {Subvectors[0], Subvectors[1]})); + Subvectors.pop_front(); + Subvectors.pop_front(); + } + + assert(Subvectors.size() == 1 && + "There should only be one subvector after tree flattening"); + + return Subvectors[0]; +} + bool TargetLowering::LegalizeSetCCCondCode(SelectionDAG &DAG, EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC, SDValue Mask, diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 51cde7ce139e2..f5ea3c0b47d6a 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -835,6 +835,10 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::GET_FPENV, VT, Expand); setOperationAction(ISD::SET_FPENV, VT, Expand); setOperationAction(ISD::RESET_FPENV, VT, Expand); + + // PartialReduceMLA operations default to expand. + setOperationAction({ISD::PARTIAL_REDUCE_UMLA, ISD::PARTIAL_REDUCE_SMLA}, VT, + Expand); } // Most targets ignore the @llvm.prefetch intrinsic. diff --git a/llvm/lib/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.cpp b/llvm/lib/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.cpp index 06c545d62d76a..b5b380971d204 100644 --- a/llvm/lib/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.cpp +++ b/llvm/lib/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.cpp @@ -47,7 +47,10 @@ void JITLinkRedirectableSymbolManager::emitRedirectableSymbols( Ptr.setScope(jitlink::Scope::Hidden); auto &Stub = PtrJumpStubCreator(*G, StubsSection, Ptr); Stub.setName(Name); - Stub.setScope(jitlink::Scope::Default); + Stub.setScope(Def.getFlags().isExported() ? jitlink::Scope::Default + : jitlink::Scope::Hidden); + Stub.setLinkage(!Def.getFlags().isWeak() ? 
jitlink::Linkage::Strong + : jitlink::Linkage::Weak); NewSymbols[std::move(PtrName)] = JITSymbolFlags(); } diff --git a/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp b/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp index 7b38621eba824..80f2a1304dde7 100644 --- a/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp @@ -387,7 +387,7 @@ void LazyReexportsManager::emitRedirectableSymbols( SymbolMap Redirs; size_t I = 0; for (auto &[Name, AI] : Reexports) - Redirs[Name] = (*ReentryPoints)[I++]; + Redirs[Name] = {(*ReentryPoints)[I++].getAddress(), AI.AliasFlags}; I = 0; if (!Reexports.empty()) { diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 04acab1e5765e..7ba23b0bd377e 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -7099,10 +7099,11 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, Function *KernelLaunchFunction = StaleCI->getCalledFunction(); // StaleCI is the CallInst which is the call to the outlined - // target kernel launch function. If there are values that the - // outlined function uses then these are aggregated into a structure - // which is passed as the second argument. If not, then there's - // only one argument, the threadID. So, StaleCI can be + // target kernel launch function. If there are local live-in values + // that the outlined function uses then these are aggregated into a structure + // which is passed as the second argument. If there are no local live-in + // values or if all values used by the outlined kernel are global variables, + // then there's only one argument, the threadID. So, StaleCI can be // // %structArg = alloca { ptr, ptr }, align 8 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0 @@ -7140,6 +7141,8 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, // host and device. assert((!HasShareds || (StaleCI->arg_size() == 2)) && "StaleCI with shareds should have exactly two arguments."); + + Value *ThreadId = ProxyFn->getArg(0); if (HasShareds) { auto *ArgStructAlloca = dyn_cast(StaleCI->getArgOperand(1)); assert(ArgStructAlloca && @@ -7150,7 +7153,6 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, AllocaInst *NewArgStructAlloca = Builder.CreateAlloca(ArgStructType, nullptr, "structArg"); Value *TaskT = ProxyFn->getArg(1); - Value *ThreadId = ProxyFn->getArg(0); Value *SharedsSize = Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType)); @@ -7163,7 +7165,10 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize); Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca}); + } else { + Builder.CreateCall(KernelLaunchFunction, {ThreadId}); } + Builder.CreateRetVoid(); return ProxyFn; } @@ -7306,11 +7311,23 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false)); Builder.restoreIP(TargetTaskBodyIP); - if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP)) return Err; - OI.ExitBB = Builder.saveIP().getBlock(); + // The outliner (CodeExtractor) extract a sequence or vector of blocks that + // it is given. These blocks are enumerated by + // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock + // to be outside the region. 
In other words, OI.ExitBlock is expected to be + // the start of the region after the outlining. We used to set OI.ExitBlock + // to the InsertBlock after TaskBodyCB is done. This is fine in most cases + // except when the task body is a single basic block. In that case, + // OI.ExitBlock is set to the single task body block and will get left out of + // the outlining process. So, simply create a new empty block to which we + // uncoditionally branch from where TaskBodyCB left off + OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont"); + emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(), + /*IsFinished=*/true); + OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait, DeviceID](Function &OutlinedFn) mutable { assert(OutlinedFn.getNumUses() == 1 && diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index 41e40cdf365d2..e09c139db39c8 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -170,11 +170,10 @@ ConstantRange ConstantRange::makeExactICmpRegion(CmpInst::Predicate Pred, const APInt &C) { // Computes the exact range that is equal to both the constant ranges returned // by makeAllowedICmpRegion and makeSatisfyingICmpRegion. This is always true - // when RHS is a singleton such as an APInt and so the assert is valid. - // However for non-singleton RHS, for example ult [2,5) makeAllowedICmpRegion - // returns [0,4) but makeSatisfyICmpRegion returns [0,2). + // when RHS is a singleton such as an APInt. However for non-singleton RHS, + // for example ult [2,5) makeAllowedICmpRegion returns [0,4) but + // makeSatisfyICmpRegion returns [0,2). // - assert(makeAllowedICmpRegion(Pred, C) == makeSatisfyingICmpRegion(Pred, C)); return makeAllowedICmpRegion(Pred, C); } diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 8d5c0b3c13e01..96939f89279c6 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -127,6 +127,7 @@ #include "llvm/CodeGen/PeepholeOptimizer.h" #include "llvm/CodeGen/PostRASchedulerList.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" +#include "llvm/CodeGen/RegAllocEvictionAdvisor.h" #include "llvm/CodeGen/RegAllocFast.h" #include "llvm/CodeGen/RegUsageInfoCollector.h" #include "llvm/CodeGen/RegUsageInfoPropagate.h" diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 4263be1098899..50be082777835 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -154,6 +154,13 @@ cl::opt EnableSVEGISel( cl::desc("Enable / disable SVE scalable vectors in Global ISel"), cl::init(false)); +// FIXME : This is a temporary flag, and is used to help transition to +// performing lowering the proper way using the new PARTIAL_REDUCE_MLA ISD +// nodes. +static cl::opt EnablePartialReduceNodes( + "aarch64-enable-partial-reduce-nodes", cl::init(false), cl::ReallyHidden, + cl::desc("Use the new method of lowering partial reductions.")); + /// Value type used for condition codes. 
static const MVT MVT_CC = MVT::i32; @@ -2050,6 +2057,8 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic( const IntrinsicInst *I) const { if (I->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add) return true; + if (EnablePartialReduceNodes) + return true; EVT VT = EVT::getEVT(I->getType()); auto Op1 = I->getOperand(1); @@ -11780,8 +11789,9 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (Align && *Align > MinSlotSize) { VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Align->value() - 1, DL, PtrVT)); - VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, - DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT)); + VAList = + DAG.getNode(ISD::AND, DL, PtrVT, VAList, + DAG.getSignedConstant(-(int64_t)Align->value(), DL, PtrVT)); } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); @@ -16147,8 +16157,9 @@ AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, Chain = SP.getValue(1); SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) - SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); + SP = + DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), + DAG.getSignedConstant(-(uint64_t)Align->value(), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); SDValue Ops[2] = {SP, Chain}; return DAG.getMergeValues(Ops, dl); @@ -16185,7 +16196,7 @@ AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); + DAG.getSignedConstant(-(uint64_t)Align->value(), dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); @@ -16213,7 +16224,7 @@ AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); + DAG.getSignedConstant(-(uint64_t)Align->value(), dl, VT)); // Set the real SP to the new value with a probing loop. 
Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP); @@ -21485,7 +21496,7 @@ static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { Op = DAG.getNode(Opcode, dl, VT, Op, - DAG.getConstant(-ShiftAmount, dl, MVT::i32)); + DAG.getSignedConstant(-ShiftAmount, dl, MVT::i32)); if (N->getValueType(0) == MVT::i64) Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op, DAG.getConstant(0, dl, MVT::i64)); @@ -21976,8 +21987,11 @@ static SDValue performIntrinsicCombine(SDNode *N, return Dot; if (SDValue WideAdd = tryLowerPartialReductionToWideAdd(N, Subtarget, DAG)) return WideAdd; - return DAG.getPartialReduceAdd(SDLoc(N), N->getValueType(0), - N->getOperand(1), N->getOperand(2)); + SDLoc DL(N); + SDValue Input = N->getOperand(2); + return DAG.getNode(ISD::PARTIAL_REDUCE_UMLA, DL, N->getValueType(0), + N->getOperand(1), Input, + DAG.getConstant(1, DL, Input.getValueType())); } case Intrinsic::aarch64_neon_vcvtfxs2fp: case Intrinsic::aarch64_neon_vcvtfxu2fp: @@ -25070,10 +25084,10 @@ static SDValue performSETCCCombine(SDNode *N, // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne if (Cond == ISD::SETNE && isNullConstant(RHS) && LHS->getOpcode() == ISD::SRL && isa(LHS->getOperand(1)) && - LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() && LHS->hasOneUse()) { EVT TstVT = LHS->getValueType(0); - if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) { + if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64 && + LHS->getConstantOperandVal(1) < TstVT.getFixedSizeInBits()) { // this pattern will get better opt in emitComparison uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1); SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0), @@ -27364,10 +27378,10 @@ static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SDLoc dl(Val128); Val2x64.first = DAG.getNode(ISD::XOR, dl, MVT::i64, - DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first); + DAG.getAllOnesConstant(dl, MVT::i64), Val2x64.first); Val2x64.second = DAG.getNode(ISD::XOR, dl, MVT::i64, - DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second); + DAG.getAllOnesConstant(dl, MVT::i64), Val2x64.second); } SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain}; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index c9549f12769d1..93a6100ce54e9 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -335,8 +335,6 @@ def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">, AssemblerPredicateWithAll<(all_of FeatureMatMulFP32), "f32mm">; def HasMatMulFP64 : Predicate<"Subtarget->hasMatMulFP64()">, AssemblerPredicateWithAll<(all_of FeatureMatMulFP64), "f64mm">; -def HasFPAC : Predicate<"Subtarget->hasFPAC())">, - AssemblerPredicateWithAll<(all_of FeatureFPAC), "fpac">; def HasXS : Predicate<"Subtarget->hasXS()">, AssemblerPredicateWithAll<(all_of FeatureXS), "xs">; def HasWFxT : Predicate<"Subtarget->hasWFxT()">, diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index d1d4986d12550..b977b6aaaf619 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -1224,6 +1224,9 @@ def : ProcessorAlias<"apple-s5", "apple-a12">; def : ProcessorModel<"apple-a13", CycloneModel, ProcessorFeatures.AppleA13, [TuneAppleA13]>; +def : ProcessorAlias<"apple-s6", "apple-a13">; 
+def : ProcessorAlias<"apple-s7", "apple-a13">; +def : ProcessorAlias<"apple-s8", "apple-a13">; def : ProcessorModel<"apple-a14", CycloneModel, ProcessorFeatures.AppleA14, [TuneAppleA14]>; @@ -1236,12 +1239,15 @@ def : ProcessorAlias<"apple-m2", "apple-a15">; def : ProcessorModel<"apple-a16", CycloneModel, ProcessorFeatures.AppleA16, [TuneAppleA16]>; def : ProcessorAlias<"apple-m3", "apple-a16">; +def : ProcessorAlias<"apple-s9", "apple-a16">; +def : ProcessorAlias<"apple-s10", "apple-a16">; def : ProcessorModel<"apple-a17", CycloneModel, ProcessorFeatures.AppleA17, [TuneAppleA17]>; def : ProcessorModel<"apple-m4", CycloneModel, ProcessorFeatures.AppleM4, [TuneAppleM4]>; +def : ProcessorAlias<"apple-a18", "apple-m4">; // Alias for the latest Apple processor model supported by LLVM. def : ProcessorAlias<"apple-latest", "apple-m4">; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 4a0e5ef58ac93..42392e22643b2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -448,7 +448,7 @@ ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(); void initializeAMDGPUOpenCLEnqueuedBlockLoweringLegacyPass(PassRegistry &); extern char &AMDGPUOpenCLEnqueuedBlockLoweringLegacyID; -void initializeGCNNSAReassignPass(PassRegistry &); +void initializeGCNNSAReassignLegacyPass(PassRegistry &); extern char &GCNNSAReassignID; void initializeGCNPreRALongBranchRegLegacyPass(PassRegistry &); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 031d8f0560ff2..a8d0bb746d2ef 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -748,7 +748,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize, OutContext, IsLocal) ->getVariableValue(), - getFunctionCodeSize(MF), MFI); + CurrentProgramInfo.getFunctionCodeSize(MF), MFI); return false; } @@ -757,7 +757,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { CurrentProgramInfo.NumArchVGPR, STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr, CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR, - CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI); + CurrentProgramInfo.ScratchSize, + CurrentProgramInfo.getFunctionCodeSize(MF), MFI); OutStreamer->emitRawComment( " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); @@ -893,27 +894,6 @@ void AMDGPUAsmPrinter::initializeTargetID(const Module &M) { } } -uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const { - const GCNSubtarget &STM = MF.getSubtarget(); - const SIInstrInfo *TII = STM.getInstrInfo(); - - uint64_t CodeSize = 0; - - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - // TODO: CodeSize should account for multiple functions. - - // TODO: Should we count size of debug info? 
- if (MI.isDebugInstr()) - continue; - - CodeSize += TII->getInstSizeInBytes(MI); - } - } - - return CodeSize; -} - // AccumOffset computed for the MCExpr equivalent of: // alignTo(std::max(1, NumVGPR), 4) / 4 - 1; static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index cc8c4411805e2..2c959d7dbbd07 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -50,8 +50,6 @@ class AMDGPUAsmPrinter final : public AsmPrinter { MCCodeEmitter *DumpCodeInstEmitter = nullptr; - uint64_t getFunctionCodeSize(const MachineFunction &MF) const; - void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF); void getAmdKernelCode(AMDGPU::AMDGPUMCKernelCodeT &Out, const SIProgramInfo &KernelInfo, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 908d323c7fec9..649deee346e90 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2426,11 +2426,8 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( return true; } - DiagnosticInfoUnsupported InvalidAddrSpaceCast( - MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); - - LLVMContext &Ctx = MF.getFunction().getContext(); - Ctx.diagnose(InvalidAddrSpaceCast); + // Invalid casts are poison. + // TODO: Should return poison B.buildUndef(Dst); MI.eraseFromParent(); return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index e9d009baa20af..09412d1b0f1cc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -132,6 +132,7 @@ class PreloadKernelArgInfo { NF->setAttributes(AL); F.replaceAllUsesWith(NF); F.setCallingConv(CallingConv::C); + F.clearMetadata(); return NF; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 225f84725874b..fd1341e8c91b2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -100,6 +100,7 @@ MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this)) MACHINE_FUNCTION_PASS("amdgpu-pre-ra-long-branch-reg", GCNPreRALongBranchRegPass()) MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass()) MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) +MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass()) MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass()) MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass()) MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass()) @@ -120,7 +121,6 @@ MACHINE_FUNCTION_PASS("si-wqm", SIWholeQuadModePass()) #define DUMMY_MACHINE_FUNCTION_PASS(NAME, CREATE_PASS) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-insert-delay-alu", AMDGPUInsertDelayAluPass()) -DUMMY_MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass()) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass()) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index eb488843b53e0..7c9377e61230b 
100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -32,6 +32,7 @@ #include "AMDGPUWaitSGPRHazards.h" #include "GCNDPPCombine.h" #include "GCNIterativeScheduler.h" +#include "GCNNSAReassign.h" #include "GCNPreRALongBranchReg.h" #include "GCNPreRAOptimizations.h" #include "GCNRewritePartialRegUses.h" @@ -550,7 +551,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUImageIntrinsicOptimizerPass(*PR); initializeAMDGPUPrintfRuntimeBindingPass(*PR); initializeAMDGPUResourceUsageAnalysisPass(*PR); - initializeGCNNSAReassignPass(*PR); + initializeGCNNSAReassignLegacyPass(*PR); initializeGCNPreRAOptimizationsLegacyPass(*PR); initializeGCNPreRALongBranchRegLegacyPass(*PR); initializeGCNRewritePartialRegUsesLegacyPass(*PR); @@ -1151,6 +1152,7 @@ class GCNPassConfig final : public AMDGPUPassConfig { void addPostRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; + void addPostBBSections() override; }; } // end anonymous namespace @@ -1690,6 +1692,11 @@ void GCNPassConfig::addPreEmitPass() { addPass(&AMDGPUInsertDelayAluID); addPass(&BranchRelaxationPassID); +} + +void GCNPassConfig::addPostBBSections() { + // We run this later to avoid passes like livedebugvalues and BBSections + // having to deal with the apparent multi-entry functions we may generate. addPass(createAMDGPUPreloadKernArgPrologLegacyPass()); } @@ -2106,6 +2113,12 @@ Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const { return Error::success(); } +void AMDGPUCodeGenPassBuilder::addPreRewrite(AddMachinePass &addPass) const { + if (EnableRegReassign) { + addPass(GCNNSAReassignPass()); + } +} + void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization( AddMachinePass &addPass) const { Base::addMachineSSAOptimization(addPass); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 1455494d0ef7d..eb5a9ca1f86d6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -177,6 +177,7 @@ class AMDGPUCodeGenPassBuilder void addILPOpts(AddMachinePass &) const; void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const; Error addInstSelector(AddMachinePass &) const; + void addPreRewrite(AddMachinePass &) const; void addMachineSSAOptimization(AddMachinePass &) const; void addPostRegAlloc(AddMachinePass &) const; diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 85e79aa4b7595..13eb0ca539a4c 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -13,6 +13,7 @@ /// //===----------------------------------------------------------------------===// +#include "GCNNSAReassign.h" #include "AMDGPU.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" @@ -34,26 +35,12 @@ STATISTIC(NumNSAConverted, "Number of NSA instructions changed to sequential"); namespace { - -class GCNNSAReassign : public MachineFunctionPass { +class GCNNSAReassignImpl { public: - static char ID; - - GCNNSAReassign() : MachineFunctionPass(ID) { - initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; + GCNNSAReassignImpl(VirtRegMap *VM, LiveRegMatrix *LM, LiveIntervals *LS) + : VRM(VM), LRM(LM), LIS(LS) {} - StringRef getPassName() const override { return "GCN NSA Reassign"; } - - void getAnalysisUsage(AnalysisUsage &AU) const 
override { - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } + bool run(MachineFunction &MF); private: using NSA_Status = enum { @@ -90,24 +77,43 @@ class GCNNSAReassign : public MachineFunctionPass { bool scavengeRegs(SmallVectorImpl &Intervals) const; }; +class GCNNSAReassignLegacy : public MachineFunctionPass { +public: + static char ID; + + GCNNSAReassignLegacy() : MachineFunctionPass(ID) { + initializeGCNNSAReassignLegacyPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "GCN NSA Reassign"; }; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + } // End anonymous namespace. -INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign", +INITIALIZE_PASS_BEGIN(GCNNSAReassignLegacy, DEBUG_TYPE, "GCN NSA Reassign", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy) INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy) -INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign", - false, false) - +INITIALIZE_PASS_END(GCNNSAReassignLegacy, DEBUG_TYPE, "GCN NSA Reassign", false, + false) -char GCNNSAReassign::ID = 0; +char GCNNSAReassignLegacy::ID = 0; -char &llvm::GCNNSAReassignID = GCNNSAReassign::ID; +char &llvm::GCNNSAReassignID = GCNNSAReassignLegacy::ID; -bool -GCNNSAReassign::tryAssignRegisters(SmallVectorImpl &Intervals, - unsigned StartReg) const { +bool GCNNSAReassignImpl::tryAssignRegisters( + SmallVectorImpl &Intervals, unsigned StartReg) const { unsigned NumRegs = Intervals.size(); for (unsigned N = 0; N < NumRegs; ++N) @@ -124,7 +130,7 @@ GCNNSAReassign::tryAssignRegisters(SmallVectorImpl &Intervals, return true; } -bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const { +bool GCNNSAReassignImpl::canAssign(unsigned StartReg, unsigned NumRegs) const { for (unsigned N = 0; N < NumRegs; ++N) { unsigned Reg = StartReg + N; if (!MRI->isAllocatable(Reg)) @@ -139,8 +145,8 @@ bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const { return true; } -bool -GCNNSAReassign::scavengeRegs(SmallVectorImpl &Intervals) const { +bool GCNNSAReassignImpl::scavengeRegs( + SmallVectorImpl &Intervals) const { unsigned NumRegs = Intervals.size(); if (NumRegs > MaxNumVGPRs) @@ -158,8 +164,8 @@ GCNNSAReassign::scavengeRegs(SmallVectorImpl &Intervals) const { return false; } -GCNNSAReassign::NSA_Status -GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { +GCNNSAReassignImpl::NSA_Status +GCNNSAReassignImpl::CheckNSA(const MachineInstr &MI, bool Fast) const { const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); if (!Info) return NSA_Status::NOT_NSA; @@ -235,16 +241,13 @@ GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { return NSA ? 
NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS; } -bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { +bool GCNNSAReassignImpl::run(MachineFunction &MF) { ST = &MF.getSubtarget(); if (!ST->hasNSAEncoding() || !ST->hasNonNSAEncoding()) return false; MRI = &MF.getRegInfo(); TRI = ST->getRegisterInfo(); - VRM = &getAnalysis().getVRM(); - LRM = &getAnalysis().getLRM(); - LIS = &getAnalysis().getLIS(); const SIMachineFunctionInfo *MFI = MF.getInfo(); MaxNumVGPRs = ST->getMaxNumVGPRs(MF); @@ -367,3 +370,24 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { return Changed; } + +bool GCNNSAReassignLegacy::runOnMachineFunction(MachineFunction &MF) { + auto *VRM = &getAnalysis().getVRM(); + auto *LRM = &getAnalysis().getLRM(); + auto *LIS = &getAnalysis().getLIS(); + + GCNNSAReassignImpl Impl(VRM, LRM, LIS); + return Impl.run(MF); +} + +PreservedAnalyses +GCNNSAReassignPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + auto &VRM = MFAM.getResult(MF); + auto &LRM = MFAM.getResult(MF); + auto &LIS = MFAM.getResult(MF); + + GCNNSAReassignImpl Impl(&VRM, &LRM, &LIS); + Impl.run(MF); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.h b/llvm/lib/Target/AMDGPU/GCNNSAReassign.h new file mode 100644 index 0000000000000..97a72e7ddbb24 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.h @@ -0,0 +1,22 @@ +//===- GCNNSAReassign.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_GCNNSAREASSIGN_H +#define LLVM_LIB_TARGET_AMDGPU_GCNNSAREASSIGN_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { +class GCNNSAReassignPass : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_GCNNSAREASSIGN_H diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index d8f3f9c54abc1..ab396929162d0 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -123,7 +123,7 @@ class SIFoldOperandsImpl { SmallVectorImpl &FoldList, SmallVectorImpl &CopiesToReplace) const; - MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const; + std::optional getImmOrMaterializedImm(MachineOperand &Op) const; bool tryConstantFoldOp(MachineInstr *MI) const; bool tryFoldCndMask(MachineInstr &MI) const; bool tryFoldZeroHighBits(MachineInstr &MI) const; @@ -166,6 +166,11 @@ class SIFoldOperandsLegacy : public MachineFunctionPass { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } }; } // End anonymous namespace. 
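The SIFoldOperands hunks that follow replace the MachineOperand pointer returned by getImmOrMaterializedImm with a std::optional<int64_t>, so callers fold a plain value after looking through a materializing move (and, in the real patch, after extracting the subregister slice of a wide immediate). A standalone sketch of that lookup shape, with hypothetical stand-in types rather than the MachineInstr/MachineOperand API:

#include <cstdint>
#include <iostream>
#include <optional>

struct Instr; // forward declaration

struct Operand {
  bool IsImm = false;
  int64_t Imm = 0;
  const Instr *DefiningMove = nullptr; // set when this operand is a vreg
};

struct Instr {
  Operand Src; // the single source of a materializing move
};

std::optional<int64_t> getImmOrMaterializedImm(const Operand &Op) {
  if (Op.IsImm)
    return Op.Imm;
  // Look through a move that materializes a constant into the register.
  if (Op.DefiningMove && Op.DefiningMove->Src.IsImm)
    return Op.DefiningMove->Src.Imm;
  return std::nullopt; // not a constant the caller can fold
}

int main() {
  Instr Mov{{true, 42, nullptr}};
  Operand UseOfReg{false, 0, &Mov};
  if (std::optional<int64_t> Imm = getImmOrMaterializedImm(UseOfReg))
    std::cout << "fold " << *Imm << "\n";
}

Returning std::nullopt rather than handing back the original operand makes the "not a constant" case explicit at every use site, which is what the rewritten tryConstantFoldOp, tryFoldCndMask and tryFoldZeroHighBits below rely on.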
@@ -819,8 +824,7 @@ bool SIFoldOperandsImpl::tryToFoldACImm( return false; uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType; - if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) && - TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) { + if (OpToFold.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) { UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm()); return true; } @@ -840,8 +844,7 @@ bool SIFoldOperandsImpl::tryToFoldACImm( MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) { MachineOperand &DefOp = Def->getOperand(1); - if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) && - TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) { + if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) { UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm()); return true; } @@ -1293,21 +1296,22 @@ static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) { MI.removeOperand(I); } -MachineOperand * +std::optional SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const { - // If this has a subregister, it obviously is a register source. - if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister || - !Op.getReg().isVirtual()) - return &Op; + if (Op.isImm()) + return Op.getImm(); - MachineInstr *Def = MRI->getVRegDef(Op.getReg()); + if (!Op.isReg() || !Op.getReg().isVirtual()) + return std::nullopt; + + const MachineInstr *Def = MRI->getVRegDef(Op.getReg()); if (Def && Def->isMoveImmediate()) { - MachineOperand &ImmSrc = Def->getOperand(1); + const MachineOperand &ImmSrc = Def->getOperand(1); if (ImmSrc.isImm()) - return &ImmSrc; + return TII->extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg()); } - return &Op; + return std::nullopt; } // Try to simplify operations with a constant that may appear after instruction @@ -1322,12 +1326,14 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); if (Src0Idx == -1) return false; - MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx)); + + MachineOperand *Src0 = &MI->getOperand(Src0Idx); + std::optional Src0Imm = getImmOrMaterializedImm(*Src0); if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 || Opc == AMDGPU::S_NOT_B32) && - Src0->isImm()) { - MI->getOperand(1).ChangeToImmediate(~Src0->getImm()); + Src0Imm) { + MI->getOperand(1).ChangeToImmediate(~*Src0Imm); mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32))); return true; } @@ -1335,17 +1341,19 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); if (Src1Idx == -1) return false; - MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx)); - if (!Src0->isImm() && !Src1->isImm()) + MachineOperand *Src1 = &MI->getOperand(Src1Idx); + std::optional Src1Imm = getImmOrMaterializedImm(*Src1); + + if (!Src0Imm && !Src1Imm) return false; // and k0, k1 -> v_mov_b32 (k0 & k1) // or k0, k1 -> v_mov_b32 (k0 | k1) // xor k0, k1 -> v_mov_b32 (k0 ^ k1) - if (Src0->isImm() && Src1->isImm()) { + if (Src0Imm && Src1Imm) { int32_t NewImm; - if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm())) + if (!evalBinaryInstruction(Opc, NewImm, *Src0Imm, *Src1Imm)) return false; bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg()); @@ -1361,12 +1369,13 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const { if 
(!MI->isCommutable()) return false; - if (Src0->isImm() && !Src1->isImm()) { + if (Src0Imm && !Src1Imm) { std::swap(Src0, Src1); std::swap(Src0Idx, Src1Idx); + std::swap(Src0Imm, Src1Imm); } - int32_t Src1Val = static_cast(Src1->getImm()); + int32_t Src1Val = static_cast(*Src1Imm); if (Opc == AMDGPU::V_OR_B32_e64 || Opc == AMDGPU::V_OR_B32_e32 || Opc == AMDGPU::S_OR_B32) { @@ -1423,9 +1432,12 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const { MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); if (!Src1->isIdenticalTo(*Src0)) { - auto *Src0Imm = getImmOrMaterializedImm(*Src0); - auto *Src1Imm = getImmOrMaterializedImm(*Src1); - if (!Src1Imm->isIdenticalTo(*Src0Imm)) + std::optional Src1Imm = getImmOrMaterializedImm(*Src1); + if (!Src1Imm) + return false; + + std::optional Src0Imm = getImmOrMaterializedImm(*Src0); + if (!Src0Imm || *Src0Imm != *Src1Imm) return false; } @@ -1458,8 +1470,8 @@ bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const { MI.getOpcode() != AMDGPU::V_AND_B32_e32) return false; - MachineOperand *Src0 = getImmOrMaterializedImm(MI.getOperand(1)); - if (!Src0->isImm() || Src0->getImm() != 0xffff) + std::optional Src0Imm = getImmOrMaterializedImm(MI.getOperand(1)); + if (!Src0Imm || *Src0Imm != 0xffff) return false; Register Src1 = MI.getOperand(2).getReg(); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.h b/llvm/lib/Target/AMDGPU/SIFoldOperands.h index d6b8f6a729526..c419ec0911e20 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.h +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.h @@ -17,6 +17,11 @@ class SIFoldOperandsPass : public PassInfoMixin { SIFoldOperandsPass() = default; PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM); + + MachineFunctionProperties getRequiredProperties() const { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } }; } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 62ee196cf8e17..e09b310d107ac 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7341,11 +7341,8 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, // global <-> flat are no-ops and never emitted. - const MachineFunction &MF = DAG.getMachineFunction(); - DiagnosticInfoUnsupported InvalidAddrSpaceCast( - MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc()); - DAG.getContext()->diagnose(InvalidAddrSpaceCast); - + // Invalid casts are poison. 
+ // TODO: Should return poison return DAG.getUNDEF(Op->getValueType(0)); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 8481c6333f479..ceab6c9dcca34 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1327,6 +1327,53 @@ Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, return Reg; } +bool SIInstrInfo::getConstValDefinedInReg(const MachineInstr &MI, + const Register Reg, + int64_t &ImmVal) const { + switch (MI.getOpcode()) { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::S_MOV_B32: + case AMDGPU::S_MOVK_I32: + case AMDGPU::S_MOV_B64: + case AMDGPU::V_MOV_B64_e32: + case AMDGPU::V_ACCVGPR_WRITE_B32_e64: + case AMDGPU::S_MOV_B64_IMM_PSEUDO: + case AMDGPU::V_MOV_B64_PSEUDO: { + const MachineOperand &Src0 = MI.getOperand(1); + if (Src0.isImm()) { + ImmVal = Src0.getImm(); + return MI.getOperand(0).getReg() == Reg; + } + + return false; + } + case AMDGPU::S_BREV_B32: + case AMDGPU::V_BFREV_B32_e32: + case AMDGPU::V_BFREV_B32_e64: { + const MachineOperand &Src0 = MI.getOperand(1); + if (Src0.isImm()) { + ImmVal = static_cast(reverseBits(Src0.getImm())); + return MI.getOperand(0).getReg() == Reg; + } + + return false; + } + case AMDGPU::S_NOT_B32: + case AMDGPU::V_NOT_B32_e32: + case AMDGPU::V_NOT_B32_e64: { + const MachineOperand &Src0 = MI.getOperand(1); + if (Src0.isImm()) { + ImmVal = static_cast(~static_cast(Src0.getImm())); + return MI.getOperand(0).getReg() == Reg; + } + + return false; + } + default: + return false; + } +} + unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { if (RI.isAGPRClass(DstRC)) @@ -3390,49 +3437,38 @@ void SIInstrInfo::removeModOperands(MachineInstr &MI) const { } } +std::optional SIInstrInfo::extractSubregFromImm(int64_t Imm, + unsigned SubRegIndex) { + switch (SubRegIndex) { + case AMDGPU::NoSubRegister: + return Imm; + case AMDGPU::sub0: + return Lo_32(Imm); + case AMDGPU::sub1: + return Hi_32(Imm); + case AMDGPU::lo16: + return SignExtend64<16>(Imm); + case AMDGPU::hi16: + return SignExtend64<16>(Imm >> 16); + case AMDGPU::sub1_lo16: + return SignExtend64<16>(Imm >> 32); + case AMDGPU::sub1_hi16: + return SignExtend64<16>(Imm >> 48); + default: + return std::nullopt; + } + + llvm_unreachable("covered subregister switch"); +} + bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const { if (!MRI->hasOneNonDBGUse(Reg)) return false; - switch (DefMI.getOpcode()) { - default: + int64_t Imm; + if (!getConstValDefinedInReg(DefMI, Reg, Imm)) return false; - case AMDGPU::V_MOV_B64_e32: - case AMDGPU::S_MOV_B64: - case AMDGPU::V_MOV_B64_PSEUDO: - case AMDGPU::S_MOV_B64_IMM_PSEUDO: - case AMDGPU::V_MOV_B32_e32: - case AMDGPU::S_MOV_B32: - case AMDGPU::V_ACCVGPR_WRITE_B32_e64: - break; - } - - const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); - assert(ImmOp); - // FIXME: We could handle FrameIndex values here. 
- if (!ImmOp->isImm()) - return false; - - auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t { - int64_t Imm = ImmOp->getImm(); - switch (UseOp.getSubReg()) { - default: - return Imm; - case AMDGPU::sub0: - return Lo_32(Imm); - case AMDGPU::sub1: - return Hi_32(Imm); - case AMDGPU::lo16: - return SignExtend64<16>(Imm); - case AMDGPU::hi16: - return SignExtend64<16>(Imm >> 16); - case AMDGPU::sub1_lo16: - return SignExtend64<16>(Imm >> 32); - case AMDGPU::sub1_hi16: - return SignExtend64<16>(Imm >> 48); - } - }; assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form"); @@ -3449,7 +3485,11 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, : AMDGPU::V_MOV_B32_e32 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::S_MOV_B32; - APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)), + + std::optional SubRegImm = + extractSubregFromImm(Imm, UseMI.getOperand(1).getSubReg()); + + APInt Imm(Is64Bit ? 64 : 32, *SubRegImm, /*isSigned=*/true, /*implicitTrunc=*/true); if (RI.isAGPR(*MRI, DstReg)) { @@ -3473,14 +3513,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, assert(UseMI.getOperand(1).getReg().isVirtual()); } + MachineFunction *MF = UseMI.getMF(); const MCInstrDesc &NewMCID = get(NewOpc); - if (DstReg.isPhysical() && - !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg)) + const TargetRegisterClass *NewDefRC = getRegClass(NewMCID, 0, &RI, *MF); + + if (DstReg.isPhysical()) { + if (!NewDefRC->contains(DstReg)) + return false; + } else if (!MRI->constrainRegClass(DstReg, NewDefRC)) return false; UseMI.setDesc(NewMCID); UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue()); - UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); + UseMI.addImplicitDefUseOperands(*MF); return true; } @@ -3497,12 +3542,14 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // If this is a free constant, there's no reason to do this. // TODO: We could fold this here instead of letting SIFoldOperands do it // later. - MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); + int Src0Idx = getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::src0); // Any src operand can be used for the legality check. - if (isInlineConstant(UseMI, *Src0, *ImmOp)) + if (isInlineConstant(UseMI, Src0Idx, Imm)) return false; + MachineOperand *Src0 = &UseMI.getOperand(Src0Idx); + bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64; bool IsFMA = @@ -3553,7 +3600,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (NewOpc == AMDGPU::V_FMAMK_F16_fake16) return false; - const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1); + const std::optional SubRegImm = extractSubregFromImm( + Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg()); // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. 
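// Illustrative only: the immediate slicing that extractSubregFromImm (above)
// performs for each AMDGPU subregister index, restated with the underlying
// MathExtras helpers on a concrete 64-bit constant.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
using namespace llvm;

int main() {
  const uint64_t K = 0x000012348000ABCDULL; // value materialized by a 64-bit move

  assert(Lo_32(K) == 0x8000ABCDu);             // sub0: bits [31:0]
  assert(Hi_32(K) == 0x00001234u);             // sub1: bits [63:32]
  assert(SignExtend64<16>(K) == -21555);       // lo16: bits [15:0]  = 0xABCD
  assert(SignExtend64<16>(K >> 16) == -32768); // hi16: bits [31:16] = 0x8000
  assert(SignExtend64<16>(K >> 32) == 0x1234); // sub1_lo16: bits [47:32]
  assert(SignExtend64<16>(K >> 48) == 0);      // sub1_hi16: bits [63:48]
  return 0;
}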
@@ -3570,7 +3618,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); - Src1->ChangeToImmediate(Imm); + Src1->ChangeToImmediate(*SubRegImm); removeModOperands(UseMI); UseMI.setDesc(get(NewOpc)); @@ -3641,8 +3689,11 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); + const std::optional SubRegImm = + extractSubregFromImm(Imm, Src2->getSubReg()); + // ChangingToImmediate adds Src2 back to the instruction. - Src2->ChangeToImmediate(getImmFor(*Src2)); + Src2->ChangeToImmediate(*SubRegImm); // These come before src2. removeModOperands(UseMI); @@ -4262,18 +4313,11 @@ bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const { } } -bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, - uint8_t OperandType) const { - assert(!MO.isReg() && "isInlineConstant called on register operand!"); - if (!MO.isImm()) - return false; - +bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const { // MachineOperand provides no way to tell the true operand size, since it only // records a 64-bit value. We need to know the size to determine if a 32-bit // floating point immediate bit pattern is legal for an integer immediate. It // would be for any 32-bit integer operand, but would not be for a 64-bit one. - - int64_t Imm = MO.getImm(); switch (OperandType) { case AMDGPU::OPERAND_REG_IMM_INT32: case AMDGPU::OPERAND_REG_IMM_FP32: @@ -4295,8 +4339,7 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: - return AMDGPU::isInlinableLiteral64(MO.getImm(), - ST.hasInv2PiInlineImm()); + return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm()); case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_AC_INT16: @@ -5888,11 +5931,17 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, if (!MO) MO = &MI.getOperand(OpIdx); + const MachineOperand *UsedLiteral = nullptr; + int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0; if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { - if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--) - return false; + if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) { + if (!LiteralLimit--) + return false; + + UsedLiteral = MO; + } SmallDenseSet SGPRsUsed; if (MO->isReg()) @@ -5913,6 +5962,12 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, } } else if (AMDGPU::isSISrcOperand(InstDesc, i) && !isInlineConstant(Op, InstDesc.operands()[i])) { + // The same literal may be used multiple times. 
+ if (!UsedLiteral) + UsedLiteral = &Op; + else if (UsedLiteral->isIdenticalTo(Op)) + continue; + if (!LiteralLimit--) return false; if (--ConstantBusLimit <= 0) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 811e4fcbebf57..79ecc2a657ed0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -278,6 +278,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const; + bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, + int64_t &ImmVal) const override; + void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, @@ -398,6 +401,15 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { void removeModOperands(MachineInstr &MI) const; + /// Return the extracted immediate value in a subregister use from a constant + /// materialized in a super register. + /// + /// e.g. %imm = S_MOV_B64 K[0:63] + /// USE %imm.sub1 + /// This will return K[32:63] + static std::optional extractSubregFromImm(int64_t ImmVal, + unsigned SubRegIndex); + bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final; @@ -1063,7 +1075,13 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { // Some operands like FrameIndexes could resolve to an inline immediate value // that will not require an additional 4-bytes; this function assumes that it // will. - bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const; + bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const { + assert(!MO.isReg() && "isInlineConstant called on register operand!"); + if (!MO.isImm()) + return false; + return isInlineConstant(MO.getImm(), OperandType); + } + bool isInlineConstant(int64_t ImmVal, uint8_t OperandType) const; bool isInlineConstant(const MachineOperand &MO, const MCOperandInfo &OpInfo) const { @@ -1091,7 +1109,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { } bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx, - const MachineOperand &MO) const { + int64_t ImmVal) const { if (OpIdx >= MI.getDesc().NumOperands) return false; @@ -1101,10 +1119,15 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { uint8_t OpType = (Size == 8) ? AMDGPU::OPERAND_REG_IMM_INT64 : AMDGPU::OPERAND_REG_IMM_INT32; - return isInlineConstant(MO, OpType); + return isInlineConstant(ImmVal, OpType); } - return isInlineConstant(MO, MI.getDesc().operands()[OpIdx].OperandType); + return isInlineConstant(ImmVal, MI.getDesc().operands()[OpIdx].OperandType); + } + + bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx, + const MachineOperand &MO) const { + return isInlineConstant(MI, OpIdx, MO.getImm()); } bool isInlineConstant(const MachineOperand &MO) const { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index bb78e77a9dc1a..4fd68b52b53bb 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -913,7 +913,7 @@ class VGPRImm : PatLeafgetConstant(-N->getSExtValue(), SDLoc(N), MVT::i32); + return CurDAG->getSignedConstant(-N->getSExtValue(), SDLoc(N), MVT::i32); }]>; // TODO: When FP inline imm values work? 
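// Sketch of why several hunks in this patch (AMDGPU, ARM, Hexagon, PowerPC,
// VE) switch from getConstant to getSignedConstant for negative values: the
// signed variant states the intent explicitly and sign-extends the value into
// the destination type instead of routing -N through a uint64_t conversion.
// The helper below is hypothetical, modelled on the VE/ARM alignment hunks.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Align an address value down to a 4-byte boundary: Ptr & -4.
static SDValue alignDownTo4(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) {
  EVT VT = Ptr.getValueType();
  SDValue Mask = DAG.getSignedConstant(-4, DL, VT);
  return DAG.getNode(ISD::AND, DL, VT, Ptr, Mask);
}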
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index 212edff097837..1123696509818 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -27,6 +27,8 @@ void SIProgramInfo::reset(const MachineFunction &MF) { const MCExpr *ZeroExpr = MCConstantExpr::create(0, Ctx); + CodeSizeInBytes.reset(); + VGPRBlocks = ZeroExpr; SGPRBlocks = ZeroExpr; Priority = 0; @@ -199,3 +201,27 @@ const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC, return MCConstantExpr::create(0, Ctx); } + +uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF) { + if (CodeSizeInBytes.has_value()) + return *CodeSizeInBytes; + + const GCNSubtarget &STM = MF.getSubtarget(); + const SIInstrInfo *TII = STM.getInstrInfo(); + + uint64_t CodeSize = 0; + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + // TODO: CodeSize should account for multiple functions. + + if (MI.isMetaInstruction()) + continue; + + CodeSize += TII->getInstSizeInBytes(MI); + } + } + + CodeSizeInBytes = CodeSize; + return CodeSize; +} diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h index 37c03d9b637f0..d7087436ae758 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -19,6 +19,7 @@ #include "llvm/IR/CallingConv.h" #include "llvm/Support/Compiler.h" #include +#include namespace llvm { @@ -29,6 +30,8 @@ class MachineFunction; /// Track resource usage for kernels / entry functions. struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo { + std::optional CodeSizeInBytes; + // Fields set in PGM_RSRC1 pm4 packet. const MCExpr *VGPRBlocks = nullptr; const MCExpr *SGPRBlocks = nullptr; @@ -97,6 +100,9 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo { // non-MCExpr members. void reset(const MachineFunction &MF); + // Get function code size and cache the value. + uint64_t getFunctionCodeSize(const MachineFunction &MF); + /// Compute the value of the ComputePGMRsrc1 register. 
const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const; diff --git a/llvm/lib/Target/ARC/ARCTargetStreamer.h b/llvm/lib/Target/ARC/MCTargetDesc/ARCTargetStreamer.h similarity index 100% rename from llvm/lib/Target/ARC/ARCTargetStreamer.h rename to llvm/lib/Target/ARC/MCTargetDesc/ARCTargetStreamer.h diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 2bac1d0086041..eb1491feb611e 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -20786,9 +20786,9 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const Chain = SP.getValue(1); SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size); if (Align) - SP = - DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32)); + SP = DAG.getNode( + ISD::AND, DL, MVT::i32, SP.getValue(0), + DAG.getSignedConstant(-(uint64_t)Align->value(), DL, MVT::i32)); Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP); SDValue Ops[2] = { SP, Chain }; return DAG.getMergeValues(Ops, DL); diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index 20881de1d94f4..f8c57fc5e0058 100644 --- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -6,13 +6,13 @@ // //===----------------------------------------------------------------------===// -#include "HexagonTargetStreamer.h" #include "MCTargetDesc/HexagonMCChecker.h" #include "MCTargetDesc/HexagonMCELFStreamer.h" #include "MCTargetDesc/HexagonMCExpr.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" #include "MCTargetDesc/HexagonShuffler.h" +#include "MCTargetDesc/HexagonTargetStreamer.h" #include "TargetInfo/HexagonTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp index f10122fdacfcd..c6f250353f736 100644 --- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -16,11 +16,11 @@ #include "HexagonInstrInfo.h" #include "HexagonRegisterInfo.h" #include "HexagonSubtarget.h" -#include "HexagonTargetStreamer.h" #include "MCTargetDesc/HexagonInstPrinter.h" #include "MCTargetDesc/HexagonMCExpr.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" +#include "MCTargetDesc/HexagonTargetStreamer.h" #include "TargetInfo/HexagonTargetInfo.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 10db4f552cdcf..c0baf301e0624 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -617,7 +617,8 @@ void HexagonDAGToDAGISel::SelectSHL(SDNode *N) { if (ConstantSDNode *C2 = dyn_cast(Shl2_1)) { int32_t ValConst = 1 << (ShlConst + C2->getSExtValue()); if (isInt<9>(-ValConst)) { - SDValue Val = CurDAG->getTargetConstant(-ValConst, dl, MVT::i32); + SDValue Val = + CurDAG->getSignedTargetConstant(-ValConst, dl, MVT::i32); SDNode *Result = CurDAG->getMachineNode(Hexagon::M2_mpysmi, dl, MVT::i32, Shl2_0, Val); ReplaceNode(N, Result); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 
1a7667fe42fbc..d66e3e306d2ff 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -3273,7 +3273,7 @@ HexagonTargetLowering::LowerUAddSubO(SDValue Op, SelectionDAG &DAG) const { if (Opc == ISD::USUBO) { SDValue Op = DAG.getNode(ISD::SUB, dl, VTs.VTs[0], {X, Y}); SDValue Ov = DAG.getSetCC(dl, MVT::i1, Op, - DAG.getConstant(-1, dl, ty(Op)), ISD::SETEQ); + DAG.getAllOnesConstant(dl, ty(Op)), ISD::SETEQ); return DAG.getMergeValues({Op, Ov}, dl); } } @@ -3491,7 +3491,7 @@ HexagonTargetLowering::PerformDAGCombine(SDNode *N, SDValue P = Op.getOperand(0); switch (P.getOpcode()) { case HexagonISD::PTRUE: - return DCI.DAG.getConstant(-1, dl, ty(Op)); + return DCI.DAG.getAllOnesConstant(dl, ty(Op)); case HexagonISD::PFALSE: return getZero(dl, ty(Op), DCI.DAG); default: diff --git a/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonTargetStreamer.h similarity index 100% rename from llvm/lib/Target/Hexagon/HexagonTargetStreamer.h rename to llvm/lib/Target/Hexagon/MCTargetDesc/HexagonTargetStreamer.h diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index d108564e128c0..8c328d5ed7234 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -11,7 +11,7 @@ #include "MCTargetDesc/MipsBaseInfo.h" #include "MCTargetDesc/MipsMCExpr.h" #include "MCTargetDesc/MipsMCTargetDesc.h" -#include "MipsTargetStreamer.h" +#include "MCTargetDesc/MipsTargetStreamer.h" #include "TargetInfo/MipsTargetInfo.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/SmallVector.h" diff --git a/llvm/lib/Target/Mips/MipsTargetStreamer.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.h similarity index 100% rename from llvm/lib/Target/Mips/MipsTargetStreamer.h rename to llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.h diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index b0b7b5dc7a31d..e06a9b36bfe4f 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -17,12 +17,12 @@ #include "MCTargetDesc/MipsInstPrinter.h" #include "MCTargetDesc/MipsMCNaCl.h" #include "MCTargetDesc/MipsMCTargetDesc.h" +#include "MCTargetDesc/MipsTargetStreamer.h" #include "Mips.h" #include "MipsMCInstLower.h" #include "MipsMachineFunction.h" #include "MipsSubtarget.h" #include "MipsTargetMachine.h" -#include "MipsTargetStreamer.h" #include "TargetInfo/MipsTargetInfo.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index afec52e289e22..e737c5aeb43c6 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -357,6 +357,10 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + if (Subtarget.hasMips32r2() || + getTargetMachine().getTargetTriple().isOSLinux()) + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); + // Lower fmin/fmax/fclass operations for MIPS R6. 
if (Subtarget.hasMips32r6()) { setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal); @@ -1315,6 +1319,8 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::STORE: return lowerSTORE(Op, DAG); case ISD::EH_DWARF_CFA: return lowerEH_DWARF_CFA(Op, DAG); case ISD::FP_TO_SINT: return lowerFP_TO_SINT(Op, DAG); + case ISD::READCYCLECOUNTER: + return lowerREADCYCLECOUNTER(Op, DAG); } return SDValue(); } @@ -2096,6 +2102,44 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicCmpSwapPartword( return exitMBB; } +SDValue MipsTargetLowering::lowerREADCYCLECOUNTER(SDValue Op, + SelectionDAG &DAG) const { + SmallVector Results; + SDLoc DL(Op); + MachineFunction &MF = DAG.getMachineFunction(); + unsigned RdhwrOpc, DestReg; + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + + if (PtrVT == MVT::i64) { + RdhwrOpc = Mips::RDHWR64; + DestReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); + SDNode *Rdhwr = DAG.getMachineNode(RdhwrOpc, DL, MVT::i64, MVT::Glue, + DAG.getRegister(Mips::HWR2, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32)); + SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, DestReg, + SDValue(Rdhwr, 0), SDValue(Rdhwr, 1)); + SDValue ResNode = + DAG.getCopyFromReg(Chain, DL, DestReg, MVT::i64, Chain.getValue(1)); + Results.push_back(ResNode); + Results.push_back(ResNode.getValue(1)); + } else { + RdhwrOpc = Mips::RDHWR; + DestReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i32)); + SDNode *Rdhwr = DAG.getMachineNode(RdhwrOpc, DL, MVT::i32, MVT::Glue, + DAG.getRegister(Mips::HWR2, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32)); + SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, DestReg, + SDValue(Rdhwr, 0), SDValue(Rdhwr, 1)); + SDValue ResNode = + DAG.getCopyFromReg(Chain, DL, DestReg, MVT::i32, Chain.getValue(1)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResNode, + DAG.getConstant(0, DL, MVT::i32))); + Results.push_back(ResNode.getValue(1)); + } + + return DAG.getMergeValues(Results, DL); +} + SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // The first operand is the chain, the second is the condition, the third is // the block to branch to if the condition is true. diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h index ee1ab6a17a91e..1d5f5e663d531 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/llvm/lib/Target/Mips/MipsISelLowering.h @@ -591,6 +591,7 @@ class TargetRegisterClass; bool IsSRA) const; SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const; /// isEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. 
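// For reference (not part of the patch): the new Mips READCYCLECOUNTER custom
// lowering above is reached from C/C++ through Clang's
// __builtin_readcyclecounter(), which becomes llvm.readcyclecounter and then
// ISD::READCYCLECOUNTER, now selected as an RDHWR read of hardware register 2.
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Start = __builtin_readcyclecounter();
  // ... code under measurement ...
  uint64_t End = __builtin_readcyclecounter();
  std::printf("elapsed cycles: %llu\n",
              static_cast<unsigned long long>(End - Start));
  return 0;
}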
diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index dc75814b9796b..016e4f9f7c6b6 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -8,8 +8,8 @@ #include "MCTargetDesc/PPCMCExpr.h" #include "MCTargetDesc/PPCMCTargetDesc.h" +#include "MCTargetDesc/PPCTargetStreamer.h" #include "PPCInstrInfo.h" -#include "PPCTargetStreamer.h" #include "TargetInfo/PowerPCTargetInfo.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" diff --git a/llvm/lib/Target/PowerPC/PPCTargetStreamer.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCTargetStreamer.h similarity index 100% rename from llvm/lib/Target/PowerPC/PPCTargetStreamer.h rename to llvm/lib/Target/PowerPC/MCTargetDesc/PPCTargetStreamer.h diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 162d11058266f..5784fe43879fe 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -19,12 +19,12 @@ #include "MCTargetDesc/PPCMCExpr.h" #include "MCTargetDesc/PPCMCTargetDesc.h" #include "MCTargetDesc/PPCPredicates.h" +#include "MCTargetDesc/PPCTargetStreamer.h" #include "PPC.h" #include "PPCInstrInfo.h" #include "PPCMachineFunctionInfo.h" #include "PPCSubtarget.h" #include "PPCTargetMachine.h" -#include "PPCTargetStreamer.h" #include "TargetInfo/PowerPCTargetInfo.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 767d1ded8de3a..4720928f472b3 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -8883,8 +8883,8 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, Round = DAG.getNode(ISD::ADD, dl, MVT::i64, Round, DAG.getConstant(2047, dl, MVT::i64)); Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT); - Round = DAG.getNode(ISD::AND, dl, MVT::i64, - Round, DAG.getConstant(-2048, dl, MVT::i64)); + Round = DAG.getNode(ISD::AND, dl, MVT::i64, Round, + DAG.getSignedConstant(-2048, dl, MVT::i64)); // However, we cannot use that value unconditionally: if the magnitude // of the input value is small, the bit-twiddling we did above might @@ -9244,7 +9244,7 @@ SDValue PPCTargetLowering::LowerGET_ROUNDING(SDValue Op, SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - unsigned BitWidth = VT.getSizeInBits(); + uint64_t BitWidth = VT.getSizeInBits(); SDLoc dl(Op); assert(Op.getNumOperands() == 3 && VT == Op.getOperand(1).getValueType() && @@ -9263,7 +9263,7 @@ SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const { SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1); SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3); SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, - DAG.getConstant(-BitWidth, dl, AmtVT)); + DAG.getSignedConstant(-BitWidth, dl, AmtVT)); SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5); SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt); @@ -9274,7 +9274,7 @@ SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const { SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc dl(Op); - unsigned BitWidth = VT.getSizeInBits(); + uint64_t BitWidth = VT.getSizeInBits(); 
assert(Op.getNumOperands() == 3 && VT == Op.getOperand(1).getValueType() && "Unexpected SRL!"); @@ -9292,7 +9292,7 @@ SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const { SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, - DAG.getConstant(-BitWidth, dl, AmtVT)); + DAG.getSignedConstant(-BitWidth, dl, AmtVT)); SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5); SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt); @@ -9303,7 +9303,7 @@ SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const { SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); EVT VT = Op.getValueType(); - unsigned BitWidth = VT.getSizeInBits(); + uint64_t BitWidth = VT.getSizeInBits(); assert(Op.getNumOperands() == 3 && VT == Op.getOperand(1).getValueType() && "Unexpected SRA!"); @@ -9320,7 +9320,7 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, - DAG.getConstant(-BitWidth, dl, AmtVT)); + DAG.getSignedConstant(-BitWidth, dl, AmtVT)); SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5); SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt); SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, dl, AmtVT), @@ -18308,7 +18308,7 @@ static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG, SDValue AddOrZ = NegConstant != 0 ? Add : Z; SDValue Addc = DAG.getNode(ISD::UADDO_CARRY, DL, DAG.getVTList(MVT::i64, CarryType), - AddOrZ, DAG.getConstant(-1ULL, DL, MVT::i64), + AddOrZ, DAG.getAllOnesConstant(DL, MVT::i64), DAG.getConstant(0, DL, CarryType)); return DAG.getNode(ISD::UADDO_CARRY, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 1ec299e3c8cc0..456fb66917216 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -4336,3 +4336,53 @@ RISCVInstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const { return std::make_unique(LHS, RHS, Cond); } + +// FIXME: We should remove this if we have a default generic scheduling model. +bool RISCVInstrInfo::isHighLatencyDef(int Opc) const { + unsigned RVVMCOpcode = RISCV::getRVVMCOpcode(Opc); + Opc = RVVMCOpcode ? RVVMCOpcode : Opc; + switch (Opc) { + default: + return false; + // Integer div/rem. + case RISCV::DIV: + case RISCV::DIVW: + case RISCV::DIVU: + case RISCV::DIVUW: + case RISCV::REM: + case RISCV::REMW: + case RISCV::REMU: + case RISCV::REMUW: + // Floating-point div/sqrt. + case RISCV::FDIV_H: + case RISCV::FDIV_S: + case RISCV::FDIV_D: + case RISCV::FDIV_H_INX: + case RISCV::FDIV_S_INX: + case RISCV::FDIV_D_INX: + case RISCV::FDIV_D_IN32X: + case RISCV::FSQRT_H: + case RISCV::FSQRT_S: + case RISCV::FSQRT_D: + case RISCV::FSQRT_H_INX: + case RISCV::FSQRT_S_INX: + case RISCV::FSQRT_D_INX: + case RISCV::FSQRT_D_IN32X: + // Vector integer div/rem + case RISCV::VDIV_VV: + case RISCV::VDIV_VX: + case RISCV::VDIVU_VV: + case RISCV::VDIVU_VX: + case RISCV::VREM_VV: + case RISCV::VREM_VX: + case RISCV::VREMU_VV: + case RISCV::VREMU_VX: + // Vector floating-point div/sqrt. 
+ case RISCV::VFDIV_VV: + case RISCV::VFDIV_VF: + case RISCV::VFRDIV_VF: + case RISCV::VFSQRT_V: + case RISCV::VFRSQRT7_V: + return true; + } +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index ec628620d2982..afbc8df50b452 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -300,6 +300,8 @@ class RISCVInstrInfo : public RISCVGenInstrInfo { std::unique_ptr analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override; + bool isHighLatencyDef(int Opc) const override; + protected: const RISCVSubtarget &STI; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 77f41e3c202c7..33c04d1c05613 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -122,6 +122,7 @@ def DecImm : SDNodeXForm, SchedUnary<"WriteVMSFSV", "ReadVMSFSV", mx, forcePassthruRead=true>; - let ForceTailAgnostic = true in def "_M_" # mti.BX # "_MASK" : VPseudoUnaryMask, SchedUnary<"WriteVMSFSV", "ReadVMSFSV", mx, @@ -4019,7 +4019,7 @@ class VPatMaskUnaryMask(inst#"_M_"#mti.BX#"_MASK") (mti.Mask VR:$passthru), (mti.Mask VR:$rs2), - (mti.Mask VMV0:$vm), GPR:$vl, mti.Log2SEW, TU_MU)>; + (mti.Mask VMV0:$vm), GPR:$vl, mti.Log2SEW, TA_MU)>; class VPatUnaryAnyMask; // 15.2 Vector count population in mask vcpop.m - def : Pat<(XLenVT (riscv_vcpop_vl (mti.Mask VR:$rs2), (mti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVCPOP_M_" # mti.BX) - VR:$rs2, GPR:$vl, mti.Log2SEW)>; def : Pat<(XLenVT (riscv_vcpop_vl (mti.Mask VR:$rs2), (mti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVCPOP_M_" # mti.BX # "_MASK") VR:$rs2, (mti.Mask VMV0:$vm), GPR:$vl, mti.Log2SEW)>; // 15.3 vfirst find-first-set mask bit - def : Pat<(XLenVT (riscv_vfirst_vl (mti.Mask VR:$rs2), (mti.Mask true_mask), - VLOpFrag)), - (!cast("PseudoVFIRST_M_" # mti.BX) - VR:$rs2, GPR:$vl, mti.Log2SEW)>; def : Pat<(XLenVT (riscv_vfirst_vl (mti.Mask VR:$rs2), (mti.Mask VMV0:$vm), VLOpFrag)), (!cast("PseudoVFIRST_M_" # mti.BX # "_MASK") diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index a7a34e0439ab1..7b897f7e34c6f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -216,6 +216,7 @@ std::string lookupBuiltinNameHelper(StringRef DemangledCall, // Check if the extracted name begins with: // - "__spirv_ImageSampleExplicitLod" // - "__spirv_ImageRead" + // - "__spirv_ImageWrite" // - "__spirv_ImageQuerySizeLod" // - "__spirv_UDotKHR" // - "__spirv_SDotKHR" @@ -233,20 +234,21 @@ std::string lookupBuiltinNameHelper(StringRef DemangledCall, // - "__spirv_SConvert" // - "__spirv_FConvert" // - "__spirv_SatConvert" - // and contains return type information at the end "_R". + // and maybe contains return type information at the end "_R". // If so, extract the plain builtin name without the type information. 
static const std::regex SpvWithR( - "(__spirv_(ImageSampleExplicitLod|ImageRead|ImageQuerySizeLod|UDotKHR|" + "(__spirv_(ImageSampleExplicitLod|ImageRead|ImageWrite|ImageQuerySizeLod|" + "UDotKHR|" "SDotKHR|SUDotKHR|SDotAccSatKHR|UDotAccSatKHR|SUDotAccSatKHR|" "ReadClockKHR|SubgroupBlockReadINTEL|SubgroupImageBlockReadINTEL|" "SubgroupImageMediaBlockReadINTEL|SubgroupImageMediaBlockWriteINTEL|" "Convert|" - "UConvert|SConvert|FConvert|SatConvert).*)_R[^_]*_?(\\w+)?.*"); + "UConvert|SConvert|FConvert|SatConvert)[^_]*)(_R[^_]*_?(\\w+)?.*)?"); std::smatch Match; if (std::regex_match(BuiltinName, Match, SpvWithR) && Match.size() > 1) { std::ssub_match SubMatch; if (DecorationId && Match.size() > 3) { - SubMatch = Match[3]; + SubMatch = Match[4]; *DecorationId = demangledPostfixToDecorationId(SubMatch.str()); } SubMatch = Match[1]; @@ -1779,6 +1781,7 @@ static bool generateGetQueryInst(const SPIRV::IncomingCall *Call, SPIRV::BuiltIn::BuiltIn Value = SPIRV::lookupGetBuiltin(Call->Builtin->Name, Call->Builtin->Set)->Value; uint64_t IsDefault = (Value == SPIRV::BuiltIn::GlobalSize || + Value == SPIRV::BuiltIn::NumWorkgroups || Value == SPIRV::BuiltIn::WorkgroupSize || Value == SPIRV::BuiltIn::EnqueuedWorkgroupSize); return genWorkgroupQuery(Call, MIRBuilder, GR, Value, IsDefault ? 1 : 0); @@ -1931,6 +1934,9 @@ static bool generateReadImageInst(const StringRef DemangledCall, const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { + if (Call->isSpirvOp()) + return buildOpFromWrapper(MIRBuilder, SPIRV::OpImageRead, Call, + GR->getSPIRVTypeID(Call->ReturnType)); Register Image = Call->Arguments[0]; MachineRegisterInfo *MRI = MIRBuilder.getMRI(); bool HasOclSampler = DemangledCall.contains_insensitive("ocl_sampler"); @@ -2010,6 +2016,9 @@ static bool generateReadImageInst(const StringRef DemangledCall, static bool generateWriteImageInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { + if (Call->isSpirvOp()) + return buildOpFromWrapper(MIRBuilder, SPIRV::OpImageWrite, Call, + Register(0)); MIRBuilder.buildInstr(SPIRV::OpImageWrite) .addUse(Call->Arguments[0]) // Image. .addUse(Call->Arguments[1]) // Coordinate. 
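// Stand-alone illustration of the SpvWithR change above: the "_R..." return
// type suffix is now optional, so wrapper names without it still resolve to
// the plain builtin name in capture group 1. The regex here is a trimmed-down
// version of the patched one (only two builtin names kept for brevity), but
// it preserves the same group numbering.
#include <cassert>
#include <regex>
#include <string>

int main() {
  static const std::regex SpvWithR(
      "(__spirv_(ImageRead|ImageWrite)[^_]*)(_R[^_]*_?(\\w+)?.*)?");

  std::smatch Match;

  // With a return-type suffix, the plain name is still capture group 1.
  std::string WithR = "__spirv_ImageRead_Rint2";
  assert(std::regex_match(WithR, Match, SpvWithR) &&
         Match[1].str() == "__spirv_ImageRead");

  // Without any "_R..." suffix the name now matches too, which is what lets
  // the __spirv_ImageWrite wrapper be recognized.
  std::string Plain = "__spirv_ImageWrite";
  assert(std::regex_match(Plain, Match, SpvWithR) &&
         Match[1].str() == "__spirv_ImageWrite");
  return 0;
}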
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index 16f4252173e33..85f42fc08a4e0 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -110,11 +110,13 @@ def : DemangledBuiltin<"__spirv_DotAccSat", OpenCL_std, IntegerDot, 3, 3>; def : DemangledBuiltin<"read_imagei", OpenCL_std, ReadImage, 2, 4>; def : DemangledBuiltin<"read_imageui", OpenCL_std, ReadImage, 2, 4>; def : DemangledBuiltin<"read_imagef", OpenCL_std, ReadImage, 2, 4>; +def : DemangledBuiltin<"__spirv_ImageRead", OpenCL_std, ReadImage, 2, 0>; def : DemangledBuiltin<"write_imagef", OpenCL_std, WriteImage, 3, 4>; def : DemangledBuiltin<"write_imagei", OpenCL_std, WriteImage, 3, 4>; def : DemangledBuiltin<"write_imageui", OpenCL_std, WriteImage, 3, 4>; def : DemangledBuiltin<"write_imageh", OpenCL_std, WriteImage, 3, 4>; +def : DemangledBuiltin<"__spirv_ImageWrite", OpenCL_std, WriteImage, 3, 0>; def : DemangledBuiltin<"__translate_sampler_initializer", OpenCL_std, SampleImage, 1, 1>; def : DemangledBuiltin<"__spirv_SampledImage", OpenCL_std, SampleImage, 2, 2>; @@ -1323,6 +1325,15 @@ multiclass DemangledGetBuiltin; +defm : DemangledGetBuiltin<"get_local_linear_id", OpenCL_std, Variable, LocalInvocationIndex>; +defm : DemangledGetBuiltin<"get_work_dim", OpenCL_std, Variable, WorkDim>; +defm : DemangledGetBuiltin<"get_sub_group_size", OpenCL_std, Variable, SubgroupSize>; +defm : DemangledGetBuiltin<"get_max_sub_group_size", OpenCL_std, Variable, SubgroupMaxSize>; +defm : DemangledGetBuiltin<"get_num_sub_groups", OpenCL_std, Variable, NumSubgroups>; +defm : DemangledGetBuiltin<"get_enqueued_num_sub_groups", OpenCL_std, Variable, NumEnqueuedSubgroups>; +defm : DemangledGetBuiltin<"get_sub_group_id", OpenCL_std, Variable, SubgroupId>; +defm : DemangledGetBuiltin<"get_sub_group_local_id", OpenCL_std, Variable, SubgroupLocalInvocationId>; defm : DemangledGetBuiltin<"get_sub_group_eq_mask", OpenCL_std, Variable, SubgroupEqMask>; defm : DemangledGetBuiltin<"get_sub_group_ge_mask", OpenCL_std, Variable, SubgroupGeMask>; defm : DemangledGetBuiltin<"get_sub_group_gt_mask", OpenCL_std, Variable, SubgroupGtMask>; @@ -1339,6 +1350,7 @@ defm : DemangledGetBuiltin<"get_global_size", OpenCL_std, GetQuery, GlobalSize>; defm : DemangledGetBuiltin<"get_group_id", OpenCL_std, GetQuery, WorkgroupId>; defm : DemangledGetBuiltin<"get_enqueued_local_size", OpenCL_std, GetQuery, EnqueuedWorkgroupSize>; defm : DemangledGetBuiltin<"get_num_groups", OpenCL_std, GetQuery, NumWorkgroups>; +defm : DemangledGetBuiltin<"get_global_offset", OpenCL_std, GetQuery, GlobalOffset>; defm : DemangledGetBuiltin<"__hlsl_wave_get_lane_index", GLSL_std_450, Wave, SubgroupLocalInvocationId>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 251bc17fef52a..5dfba8427258f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -135,7 +135,7 @@ class SPIRVEmitIntrinsics // deduce Types of operands of the Instruction if possible void deduceOperandElementType(Instruction *I, - SmallPtrSet *UncompleteRets, + SmallPtrSet *IncompleteRets, const SmallPtrSet *AskOps = nullptr, bool IsPostprocessing = false); @@ -182,12 +182,12 @@ class SPIRVEmitIntrinsics bool deduceOperandElementTypeCalledFunction( CallInst *CI, SmallVector> &Ops, - Type *&KnownElemTy); + Type *&KnownElemTy, bool &Incomplete); void 
deduceOperandElementTypeFunctionPointer( CallInst *CI, SmallVector> &Ops, Type *&KnownElemTy, bool IsPostprocessing); bool deduceOperandElementTypeFunctionRet( - Instruction *I, SmallPtrSet *UncompleteRets, + Instruction *I, SmallPtrSet *IncompleteRets, const SmallPtrSet *AskOps, bool IsPostprocessing, Type *&KnownElemTy, Value *Op, Function *F); @@ -893,7 +893,7 @@ static inline Type *getAtomicElemTy(SPIRVGlobalRegistry *GR, Instruction *I, // indirect function invocation, and true otherwise. bool SPIRVEmitIntrinsics::deduceOperandElementTypeCalledFunction( CallInst *CI, SmallVector> &Ops, - Type *&KnownElemTy) { + Type *&KnownElemTy, bool &Incomplete) { Function *CalledF = CI->getCalledFunction(); if (!CalledF) return false; @@ -915,12 +915,15 @@ bool SPIRVEmitIntrinsics::deduceOperandElementTypeCalledFunction( Ops.push_back(std::make_pair(Op, i)); } } else if (Grp == SPIRV::Atomic || Grp == SPIRV::AtomicFloating) { - if (CI->arg_size() < 2) + if (CI->arg_size() == 0) return true; Value *Op = CI->getArgOperand(0); if (!isPointerTy(Op->getType())) return true; switch (Opcode) { + case SPIRV::OpAtomicFAddEXT: + case SPIRV::OpAtomicFMinEXT: + case SPIRV::OpAtomicFMaxEXT: case SPIRV::OpAtomicLoad: case SPIRV::OpAtomicCompareExchangeWeak: case SPIRV::OpAtomicCompareExchange: @@ -934,9 +937,23 @@ bool SPIRVEmitIntrinsics::deduceOperandElementTypeCalledFunction( case SPIRV::OpAtomicUMax: case SPIRV::OpAtomicSMin: case SPIRV::OpAtomicSMax: { - KnownElemTy = getAtomicElemTy(GR, CI, Op); + KnownElemTy = isPointerTy(CI->getType()) ? getAtomicElemTy(GR, CI, Op) + : CI->getType(); if (!KnownElemTy) return true; + Incomplete = isTodoType(Op); + Ops.push_back(std::make_pair(Op, 0)); + } break; + case SPIRV::OpAtomicStore: { + if (CI->arg_size() < 4) + return true; + Value *ValOp = CI->getArgOperand(3); + KnownElemTy = isPointerTy(ValOp->getType()) + ? getAtomicElemTy(GR, CI, Op) + : ValOp->getType(); + if (!KnownElemTy) + return true; + Incomplete = isTodoType(Op); Ops.push_back(std::make_pair(Op, 0)); } break; } @@ -954,7 +971,7 @@ void SPIRVEmitIntrinsics::deduceOperandElementTypeFunctionPointer( return; Ops.push_back(std::make_pair(Op, std::numeric_limits::max())); FunctionType *FTy = CI->getFunctionType(); - bool IsNewFTy = false, IsUncomplete = false; + bool IsNewFTy = false, IsIncomplete = false; SmallVector ArgTys; for (Value *Arg : CI->args()) { Type *ArgTy = Arg->getType(); @@ -963,9 +980,9 @@ void SPIRVEmitIntrinsics::deduceOperandElementTypeFunctionPointer( IsNewFTy = true; ArgTy = getTypedPointerWrapper(ElemTy, getPointerAddressSpace(ArgTy)); if (isTodoType(Arg)) - IsUncomplete = true; + IsIncomplete = true; } else { - IsUncomplete = true; + IsIncomplete = true; } } ArgTys.push_back(ArgTy); @@ -977,19 +994,19 @@ void SPIRVEmitIntrinsics::deduceOperandElementTypeFunctionPointer( RetTy = getTypedPointerWrapper(ElemTy, getPointerAddressSpace(CI->getType())); if (isTodoType(CI)) - IsUncomplete = true; + IsIncomplete = true; } else { - IsUncomplete = true; + IsIncomplete = true; } } - if (!IsPostprocessing && IsUncomplete) + if (!IsPostprocessing && IsIncomplete) insertTodoType(Op); KnownElemTy = IsNewFTy ? 
FunctionType::get(RetTy, ArgTys, FTy->isVarArg()) : FTy; } bool SPIRVEmitIntrinsics::deduceOperandElementTypeFunctionRet( - Instruction *I, SmallPtrSet *UncompleteRets, + Instruction *I, SmallPtrSet *IncompleteRets, const SmallPtrSet *AskOps, bool IsPostprocessing, Type *&KnownElemTy, Value *Op, Function *F) { KnownElemTy = GR->findDeducedElementType(F); @@ -1018,13 +1035,13 @@ bool SPIRVEmitIntrinsics::deduceOperandElementTypeFunctionRet( // This may happen just once per a function, the latch is a pair of // findDeducedElementType(F) / addDeducedElementType(F, ...). // With or without the latch it is a non-recursive call due to - // UncompleteRets set to nullptr in this call. - if (UncompleteRets) - for (Instruction *UncompleteRetI : *UncompleteRets) - deduceOperandElementType(UncompleteRetI, nullptr, AskOps, + // IncompleteRets set to nullptr in this call. + if (IncompleteRets) + for (Instruction *IncompleteRetI : *IncompleteRets) + deduceOperandElementType(IncompleteRetI, nullptr, AskOps, IsPostprocessing); - } else if (UncompleteRets) { - UncompleteRets->insert(I); + } else if (IncompleteRets) { + IncompleteRets->insert(I); } TypeValidated.insert(I); return true; @@ -1035,17 +1052,17 @@ bool SPIRVEmitIntrinsics::deduceOperandElementTypeFunctionRet( // types which differ from expected, this function tries to insert a bitcast to // resolve the issue. void SPIRVEmitIntrinsics::deduceOperandElementType( - Instruction *I, SmallPtrSet *UncompleteRets, + Instruction *I, SmallPtrSet *IncompleteRets, const SmallPtrSet *AskOps, bool IsPostprocessing) { SmallVector> Ops; Type *KnownElemTy = nullptr; - bool Uncomplete = false; + bool Incomplete = false; // look for known basic patterns of type inference if (auto *Ref = dyn_cast(I)) { if (!isPointerTy(I->getType()) || !(KnownElemTy = GR->findDeducedElementType(I))) return; - Uncomplete = isTodoType(I); + Incomplete = isTodoType(I); for (unsigned i = 0; i < Ref->getNumIncomingValues(); i++) { Value *Op = Ref->getIncomingValue(i); if (isPointerTy(Op->getType())) @@ -1055,7 +1072,7 @@ void SPIRVEmitIntrinsics::deduceOperandElementType( KnownElemTy = GR->findDeducedElementType(I); if (!KnownElemTy) return; - Uncomplete = isTodoType(I); + Incomplete = isTodoType(I); Ops.push_back(std::make_pair(Ref->getPointerOperand(), 0)); } else if (auto *Ref = dyn_cast(I)) { if (!isPointerTy(I->getType())) @@ -1063,7 +1080,7 @@ void SPIRVEmitIntrinsics::deduceOperandElementType( KnownElemTy = GR->findDeducedElementType(I); if (!KnownElemTy) return; - Uncomplete = isTodoType(I); + Incomplete = isTodoType(I); Ops.push_back(std::make_pair(Ref->getOperand(0), 0)); } else if (auto *Ref = dyn_cast(I)) { if (GR->findDeducedElementType(Ref->getPointerOperand())) @@ -1090,22 +1107,28 @@ void SPIRVEmitIntrinsics::deduceOperandElementType( Ops.push_back(std::make_pair(Ref->getPointerOperand(), StoreInst::getPointerOperandIndex())); } else if (auto *Ref = dyn_cast(I)) { - KnownElemTy = getAtomicElemTy(GR, I, Ref->getPointerOperand()); + KnownElemTy = isPointerTy(I->getType()) + ? getAtomicElemTy(GR, I, Ref->getPointerOperand()) + : I->getType(); if (!KnownElemTy) return; + Incomplete = isTodoType(Ref->getPointerOperand()); Ops.push_back(std::make_pair(Ref->getPointerOperand(), AtomicCmpXchgInst::getPointerOperandIndex())); } else if (auto *Ref = dyn_cast(I)) { - KnownElemTy = getAtomicElemTy(GR, I, Ref->getPointerOperand()); + KnownElemTy = isPointerTy(I->getType()) + ? 
getAtomicElemTy(GR, I, Ref->getPointerOperand()) + : I->getType(); if (!KnownElemTy) return; + Incomplete = isTodoType(Ref->getPointerOperand()); Ops.push_back(std::make_pair(Ref->getPointerOperand(), AtomicRMWInst::getPointerOperandIndex())); } else if (auto *Ref = dyn_cast(I)) { if (!isPointerTy(I->getType()) || !(KnownElemTy = GR->findDeducedElementType(I))) return; - Uncomplete = isTodoType(I); + Incomplete = isTodoType(I); for (unsigned i = 0; i < Ref->getNumOperands(); i++) { Value *Op = Ref->getOperand(i); if (isPointerTy(Op->getType())) @@ -1117,11 +1140,11 @@ void SPIRVEmitIntrinsics::deduceOperandElementType( Value *Op = Ref->getReturnValue(); if (!Op) return; - if (deduceOperandElementTypeFunctionRet(I, UncompleteRets, AskOps, + if (deduceOperandElementTypeFunctionRet(I, IncompleteRets, AskOps, IsPostprocessing, KnownElemTy, Op, CurrF)) return; - Uncomplete = isTodoType(CurrF); + Incomplete = isTodoType(CurrF); Ops.push_back(std::make_pair(Op, 0)); } else if (auto *Ref = dyn_cast(I)) { if (!isPointerTy(Ref->getOperand(0)->getType())) @@ -1132,16 +1155,16 @@ void SPIRVEmitIntrinsics::deduceOperandElementType( Type *ElemTy1 = GR->findDeducedElementType(Op1); if (ElemTy0) { KnownElemTy = ElemTy0; - Uncomplete = isTodoType(Op0); + Incomplete = isTodoType(Op0); Ops.push_back(std::make_pair(Op1, 1)); } else if (ElemTy1) { KnownElemTy = ElemTy1; - Uncomplete = isTodoType(Op1); + Incomplete = isTodoType(Op1); Ops.push_back(std::make_pair(Op0, 0)); } } else if (CallInst *CI = dyn_cast(I)) { if (!CI->isIndirectCall()) - deduceOperandElementTypeCalledFunction(CI, Ops, KnownElemTy); + deduceOperandElementTypeCalledFunction(CI, Ops, KnownElemTy, Incomplete); else if (HaveFunPtrs) deduceOperandElementTypeFunctionPointer(CI, Ops, KnownElemTy, IsPostprocessing); @@ -1175,7 +1198,7 @@ void SPIRVEmitIntrinsics::deduceOperandElementType( Type *PrevElemTy = GR->findDeducedElementType(Op); GR->addDeducedElementType(Op, normalizeType(KnownElemTy)); // check if KnownElemTy is complete - if (!Uncomplete) + if (!Incomplete) eraseTodoType(Op); else if (!IsPostprocessing) insertTodoType(Op); @@ -2394,9 +2417,9 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { // Pass backward: use instructions results to specify/update/cast operands // where needed. - SmallPtrSet UncompleteRets; + SmallPtrSet IncompleteRets; for (auto &I : llvm::reverse(instructions(Func))) - deduceOperandElementType(&I, &UncompleteRets); + deduceOperandElementType(&I, &IncompleteRets); // Pass forward for PHIs only, their operands are not preceed the instruction // in meaning of `instructions(Func)`. 
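// Illustrative-only restatement (hypothetical helper, not the pass itself) of
// the deduction rule the hunks above add for atomics: when the atomic's result
// is not a pointer, the result/value type itself is the pointee type of the
// pointer operand, e.g. for
//   %old = atomicrmw add ptr %p, i32 %v seq_cst
// the element type of %p is deduced as i32 without consulting getAtomicElemTy.
#include "llvm/IR/Instructions.h"
using namespace llvm;

static Type *deduceAtomicPointeeTy(const AtomicRMWInst &RMW) {
  Type *ResultTy = RMW.getType();
  // Pointer-typed results keep the old getAtomicElemTy() path; return null
  // here to signal "not handled by this shortcut".
  return ResultTy->isPointerTy() ? nullptr : ResultTy;
}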
@@ -2465,7 +2488,7 @@ bool SPIRVEmitIntrinsics::postprocessTypes(Module &M) { for (auto &F : M) { CurrF = &F; - SmallPtrSet UncompleteRets; + SmallPtrSet IncompleteRets; for (auto &I : llvm::reverse(instructions(F))) { auto It = ToProcess.find(&I); if (It == ToProcess.end()) @@ -2473,7 +2496,7 @@ bool SPIRVEmitIntrinsics::postprocessTypes(Module &M) { It->second.remove_if([this](Value *V) { return !isTodoType(V); }); if (It->second.size() == 0) continue; - deduceOperandElementType(&I, &UncompleteRets, &It->second, true); + deduceOperandElementType(&I, &IncompleteRets, &It->second, true); if (TodoTypeSz == 0) return true; } diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index b892c9ea69602..4b26437c5fecb 100644 --- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -9,7 +9,7 @@ #include "MCTargetDesc/SystemZGNUInstPrinter.h" #include "MCTargetDesc/SystemZMCAsmInfo.h" #include "MCTargetDesc/SystemZMCTargetDesc.h" -#include "SystemZTargetStreamer.h" +#include "MCTargetDesc/SystemZTargetStreamer.h" #include "TargetInfo/SystemZTargetInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index f6951c39ce9be..e84368c769e29 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -35,100 +35,81 @@ using namespace llvm; #include "SystemZGenRegisterInfo.inc" const unsigned SystemZMC::GR32Regs[16] = { - SystemZ::R0L, SystemZ::R1L, SystemZ::R2L, SystemZ::R3L, - SystemZ::R4L, SystemZ::R5L, SystemZ::R6L, SystemZ::R7L, - SystemZ::R8L, SystemZ::R9L, SystemZ::R10L, SystemZ::R11L, - SystemZ::R12L, SystemZ::R13L, SystemZ::R14L, SystemZ::R15L -}; + SystemZ::R0L, SystemZ::R1L, SystemZ::R2L, SystemZ::R3L, + SystemZ::R4L, SystemZ::R5L, SystemZ::R6L, SystemZ::R7L, + SystemZ::R8L, SystemZ::R9L, SystemZ::R10L, SystemZ::R11L, + SystemZ::R12L, SystemZ::R13L, SystemZ::R14L, SystemZ::R15L}; const unsigned SystemZMC::GRH32Regs[16] = { - SystemZ::R0H, SystemZ::R1H, SystemZ::R2H, SystemZ::R3H, - SystemZ::R4H, SystemZ::R5H, SystemZ::R6H, SystemZ::R7H, - SystemZ::R8H, SystemZ::R9H, SystemZ::R10H, SystemZ::R11H, - SystemZ::R12H, SystemZ::R13H, SystemZ::R14H, SystemZ::R15H -}; + SystemZ::R0H, SystemZ::R1H, SystemZ::R2H, SystemZ::R3H, + SystemZ::R4H, SystemZ::R5H, SystemZ::R6H, SystemZ::R7H, + SystemZ::R8H, SystemZ::R9H, SystemZ::R10H, SystemZ::R11H, + SystemZ::R12H, SystemZ::R13H, SystemZ::R14H, SystemZ::R15H}; const unsigned SystemZMC::GR64Regs[16] = { - SystemZ::R0D, SystemZ::R1D, SystemZ::R2D, SystemZ::R3D, - SystemZ::R4D, SystemZ::R5D, SystemZ::R6D, SystemZ::R7D, - SystemZ::R8D, SystemZ::R9D, SystemZ::R10D, SystemZ::R11D, - SystemZ::R12D, SystemZ::R13D, SystemZ::R14D, SystemZ::R15D -}; + SystemZ::R0D, SystemZ::R1D, SystemZ::R2D, SystemZ::R3D, + SystemZ::R4D, SystemZ::R5D, SystemZ::R6D, SystemZ::R7D, + SystemZ::R8D, SystemZ::R9D, SystemZ::R10D, SystemZ::R11D, + SystemZ::R12D, SystemZ::R13D, SystemZ::R14D, SystemZ::R15D}; const unsigned SystemZMC::GR128Regs[16] = { - SystemZ::R0Q, 0, SystemZ::R2Q, 0, - SystemZ::R4Q, 0, SystemZ::R6Q, 0, - SystemZ::R8Q, 0, SystemZ::R10Q, 0, - SystemZ::R12Q, 0, SystemZ::R14Q, 0 -}; + SystemZ::R0Q, 0, SystemZ::R2Q, 0, SystemZ::R4Q, 0, SystemZ::R6Q, 0, + SystemZ::R8Q, 0, SystemZ::R10Q, 0, SystemZ::R12Q, 0, 
SystemZ::R14Q, 0}; const unsigned SystemZMC::FP32Regs[16] = { - SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S, - SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S, - SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S, - SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S -}; + SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S, + SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S, + SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S, + SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S}; const unsigned SystemZMC::FP64Regs[16] = { - SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D, - SystemZ::F4D, SystemZ::F5D, SystemZ::F6D, SystemZ::F7D, - SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D, - SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D -}; + SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D, + SystemZ::F4D, SystemZ::F5D, SystemZ::F6D, SystemZ::F7D, + SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D, + SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D}; const unsigned SystemZMC::FP128Regs[16] = { - SystemZ::F0Q, SystemZ::F1Q, 0, 0, - SystemZ::F4Q, SystemZ::F5Q, 0, 0, - SystemZ::F8Q, SystemZ::F9Q, 0, 0, - SystemZ::F12Q, SystemZ::F13Q, 0, 0 -}; + SystemZ::F0Q, SystemZ::F1Q, 0, 0, SystemZ::F4Q, SystemZ::F5Q, 0, 0, + SystemZ::F8Q, SystemZ::F9Q, 0, 0, SystemZ::F12Q, SystemZ::F13Q, 0, 0}; const unsigned SystemZMC::VR32Regs[32] = { - SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S, - SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S, - SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S, - SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S, - SystemZ::F16S, SystemZ::F17S, SystemZ::F18S, SystemZ::F19S, - SystemZ::F20S, SystemZ::F21S, SystemZ::F22S, SystemZ::F23S, - SystemZ::F24S, SystemZ::F25S, SystemZ::F26S, SystemZ::F27S, - SystemZ::F28S, SystemZ::F29S, SystemZ::F30S, SystemZ::F31S -}; + SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S, SystemZ::F4S, + SystemZ::F5S, SystemZ::F6S, SystemZ::F7S, SystemZ::F8S, SystemZ::F9S, + SystemZ::F10S, SystemZ::F11S, SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, + SystemZ::F15S, SystemZ::F16S, SystemZ::F17S, SystemZ::F18S, SystemZ::F19S, + SystemZ::F20S, SystemZ::F21S, SystemZ::F22S, SystemZ::F23S, SystemZ::F24S, + SystemZ::F25S, SystemZ::F26S, SystemZ::F27S, SystemZ::F28S, SystemZ::F29S, + SystemZ::F30S, SystemZ::F31S}; const unsigned SystemZMC::VR64Regs[32] = { - SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D, - SystemZ::F4D, SystemZ::F5D, SystemZ::F6D, SystemZ::F7D, - SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D, - SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D, - SystemZ::F16D, SystemZ::F17D, SystemZ::F18D, SystemZ::F19D, - SystemZ::F20D, SystemZ::F21D, SystemZ::F22D, SystemZ::F23D, - SystemZ::F24D, SystemZ::F25D, SystemZ::F26D, SystemZ::F27D, - SystemZ::F28D, SystemZ::F29D, SystemZ::F30D, SystemZ::F31D -}; + SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D, SystemZ::F4D, + SystemZ::F5D, SystemZ::F6D, SystemZ::F7D, SystemZ::F8D, SystemZ::F9D, + SystemZ::F10D, SystemZ::F11D, SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, + SystemZ::F15D, SystemZ::F16D, SystemZ::F17D, SystemZ::F18D, SystemZ::F19D, + SystemZ::F20D, SystemZ::F21D, SystemZ::F22D, SystemZ::F23D, SystemZ::F24D, + SystemZ::F25D, SystemZ::F26D, SystemZ::F27D, SystemZ::F28D, SystemZ::F29D, + SystemZ::F30D, SystemZ::F31D}; const unsigned SystemZMC::VR128Regs[32] = { - SystemZ::V0, SystemZ::V1, SystemZ::V2, SystemZ::V3, - SystemZ::V4, 
SystemZ::V5, SystemZ::V6, SystemZ::V7, - SystemZ::V8, SystemZ::V9, SystemZ::V10, SystemZ::V11, - SystemZ::V12, SystemZ::V13, SystemZ::V14, SystemZ::V15, - SystemZ::V16, SystemZ::V17, SystemZ::V18, SystemZ::V19, - SystemZ::V20, SystemZ::V21, SystemZ::V22, SystemZ::V23, - SystemZ::V24, SystemZ::V25, SystemZ::V26, SystemZ::V27, - SystemZ::V28, SystemZ::V29, SystemZ::V30, SystemZ::V31 -}; + SystemZ::V0, SystemZ::V1, SystemZ::V2, SystemZ::V3, SystemZ::V4, + SystemZ::V5, SystemZ::V6, SystemZ::V7, SystemZ::V8, SystemZ::V9, + SystemZ::V10, SystemZ::V11, SystemZ::V12, SystemZ::V13, SystemZ::V14, + SystemZ::V15, SystemZ::V16, SystemZ::V17, SystemZ::V18, SystemZ::V19, + SystemZ::V20, SystemZ::V21, SystemZ::V22, SystemZ::V23, SystemZ::V24, + SystemZ::V25, SystemZ::V26, SystemZ::V27, SystemZ::V28, SystemZ::V29, + SystemZ::V30, SystemZ::V31}; const unsigned SystemZMC::AR32Regs[16] = { - SystemZ::A0, SystemZ::A1, SystemZ::A2, SystemZ::A3, - SystemZ::A4, SystemZ::A5, SystemZ::A6, SystemZ::A7, - SystemZ::A8, SystemZ::A9, SystemZ::A10, SystemZ::A11, - SystemZ::A12, SystemZ::A13, SystemZ::A14, SystemZ::A15 -}; + SystemZ::A0, SystemZ::A1, SystemZ::A2, SystemZ::A3, + SystemZ::A4, SystemZ::A5, SystemZ::A6, SystemZ::A7, + SystemZ::A8, SystemZ::A9, SystemZ::A10, SystemZ::A11, + SystemZ::A12, SystemZ::A13, SystemZ::A14, SystemZ::A15}; const unsigned SystemZMC::CR64Regs[16] = { - SystemZ::C0, SystemZ::C1, SystemZ::C2, SystemZ::C3, - SystemZ::C4, SystemZ::C5, SystemZ::C6, SystemZ::C7, - SystemZ::C8, SystemZ::C9, SystemZ::C10, SystemZ::C11, - SystemZ::C12, SystemZ::C13, SystemZ::C14, SystemZ::C15 -}; + SystemZ::C0, SystemZ::C1, SystemZ::C2, SystemZ::C3, + SystemZ::C4, SystemZ::C5, SystemZ::C6, SystemZ::C7, + SystemZ::C8, SystemZ::C9, SystemZ::C10, SystemZ::C11, + SystemZ::C12, SystemZ::C13, SystemZ::C14, SystemZ::C15}; unsigned SystemZMC::getFirstReg(unsigned Reg) { static unsigned Map[SystemZ::NUM_TARGET_REGS]; diff --git a/llvm/lib/Target/SystemZ/SystemZTargetStreamer.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZTargetStreamer.h similarity index 100% rename from llvm/lib/Target/SystemZ/SystemZTargetStreamer.h rename to llvm/lib/Target/SystemZ/MCTargetDesc/SystemZTargetStreamer.h diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h index 2696702b44551..47e7f67e2cdc7 100644 --- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h +++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h @@ -9,9 +9,9 @@ #ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H +#include "MCTargetDesc/SystemZTargetStreamer.h" #include "SystemZMCInstLower.h" #include "SystemZTargetMachine.h" -#include "SystemZTargetStreamer.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/MC/MCInstBuilder.h" diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp index aff058868f306..62064579b4bdf 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -1216,8 +1216,9 @@ SDValue VETargetLowering::lowerATOMIC_SWAP(SDValue Op, SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits); SDValue Ptr = N->getOperand(1); - SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), - {Ptr, DAG.getConstant(-4, DL, MVT::i64)}); + SDValue Aligned = + DAG.getNode(ISD::AND, DL, Ptr.getValueType(), + {Ptr, DAG.getSignedConstant(-4, DL, MVT::i64)}); SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(), DAG.getVTList(Op.getNode()->getValueType(0), 
Op.getNode()->getValueType(1)), @@ -1235,8 +1236,9 @@ SDValue VETargetLowering::lowerATOMIC_SWAP(SDValue Op, SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits); SDValue Ptr = N->getOperand(1); - SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), - {Ptr, DAG.getConstant(-4, DL, MVT::i64)}); + SDValue Aligned = + DAG.getNode(ISD::AND, DL, Ptr.getValueType(), + {Ptr, DAG.getSignedConstant(-4, DL, MVT::i64)}); SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(), DAG.getVTList(Op.getNode()->getValueType(0), Op.getNode()->getValueType(1)), @@ -1601,7 +1603,7 @@ SDValue VETargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const { VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Align - 1, DL, PtrVT)); VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, - DAG.getConstant(-Align, DL, PtrVT)); + DAG.getSignedConstant(-Align, DL, PtrVT)); // Increment the pointer, VAList, by 16 to the next vaarg. NextPtr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(16, DL)); diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 50c56c9dd08b3..10fb6994b51b6 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2639,11 +2639,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) { // Add the return addr area delta back since we are not tail calling. - int64_t Offset = -1 * X86FI->getTCReturnAddrDelta(); - assert(Offset >= 0 && "TCDelta should never be positive"); - if (Offset) { + int64_t Delta = X86FI->getTCReturnAddrDelta(); + assert(Delta <= 0 && "TCDelta should never be positive"); + if (Delta) { // Check for possible merge with preceding ADD instruction. - Offset = mergeSPAdd(MBB, Terminator, Offset, true); + int64_t Offset = mergeSPAdd(MBB, Terminator, -Delta, true); emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true); } } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 21b08a4a93fc7..386d56dcda9de 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4679,9 +4679,24 @@ static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG) { EVT InVT = In.getValueType(); assert(VT.isVector() && InVT.isVector() && "Expected vector VTs."); - assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode || - ISD::ZERO_EXTEND == Opcode) && - "Unknown extension opcode"); + + // Canonicalize Opcode to general extension version. + switch (Opcode) { + case ISD::ANY_EXTEND: + case ISD::ANY_EXTEND_VECTOR_INREG: + Opcode = ISD::ANY_EXTEND; + break; + case ISD::SIGN_EXTEND: + case ISD::SIGN_EXTEND_VECTOR_INREG: + Opcode = ISD::SIGN_EXTEND; + break; + case ISD::ZERO_EXTEND: + case ISD::ZERO_EXTEND_VECTOR_INREG: + Opcode = ISD::ZERO_EXTEND; + break; + default: + llvm_unreachable("Unknown extension opcode"); + } // For 256-bit vectors, we only need the lower (128-bit) input half. // For 512-bit vectors, we only need the lower input half or quarter. 
@@ -57582,7 +57597,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType(); if (SubVT.isSimple() && SubVT.isVector()) { EVT ConcatVT = - EVT::getVectorVT(*DAG.getContext(), SubVT.getScalarType(), + EVT::getVectorVT(Ctx, SubVT.getScalarType(), SubVT.getVectorElementCount() * Subs.size()); for (SDValue &Sub : Subs) Sub = DAG.getBitcast(SubVT, Sub); @@ -57864,6 +57879,32 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, } } break; + case ISD::ANY_EXTEND_VECTOR_INREG: + case ISD::SIGN_EXTEND_VECTOR_INREG: + case ISD::ZERO_EXTEND_VECTOR_INREG: { + // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND. + if (!IsSplat && NumOps == 2 && + ((VT.is256BitVector() && Subtarget.hasInt256()) || + (VT.is512BitVector() && Subtarget.useAVX512Regs() && + (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) && + Op0.getOperand(0).getValueType().is128BitVector() && + Op0.getOperand(0).getValueType() == + Ops[0].getOperand(0).getValueType()) { + EVT SrcVT = Op0.getOperand(0).getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + MVT UnpackSVT = + MVT::getIntegerVT(SrcVT.getScalarSizeInBits() * (NumElts / 2)); + MVT UnpackVT = + MVT::getVectorVT(UnpackSVT, 128 / UnpackSVT.getScalarSizeInBits()); + SDValue Unpack = + DAG.getNode(X86ISD::UNPCKL, DL, UnpackVT, + DAG.getBitcast(UnpackVT, Ops[0].getOperand(0)), + DAG.getBitcast(UnpackVT, Ops[1].getOperand(0))); + return getEXTEND_VECTOR_INREG(Op0.getOpcode(), DL, VT, + DAG.getBitcast(SrcVT, Unpack), DAG); + } + break; + } case X86ISD::VSHLI: case X86ISD::VSRLI: // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle. diff --git a/llvm/lib/Target/XCore/XCoreTargetStreamer.h b/llvm/lib/Target/XCore/MCTargetDesc/XCoreTargetStreamer.h similarity index 100% rename from llvm/lib/Target/XCore/XCoreTargetStreamer.h rename to llvm/lib/Target/XCore/MCTargetDesc/XCoreTargetStreamer.h diff --git a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp index 15be47a73cef3..a1f7608224b90 100644 --- a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp +++ b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp @@ -12,12 +12,12 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/XCoreInstPrinter.h" +#include "MCTargetDesc/XCoreTargetStreamer.h" #include "TargetInfo/XCoreTargetInfo.h" #include "XCore.h" #include "XCoreMCInstLower.h" #include "XCoreSubtarget.h" #include "XCoreTargetMachine.h" -#include "XCoreTargetStreamer.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/AsmPrinter.h" diff --git a/llvm/lib/Telemetry/Telemetry.cpp b/llvm/lib/Telemetry/Telemetry.cpp index 9e13d08334e3b..d86ad9c1c37bb 100644 --- a/llvm/lib/Telemetry/Telemetry.cpp +++ b/llvm/lib/Telemetry/Telemetry.cpp @@ -1,3 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file provides the basic framework for Telemetry. +/// Refer to its documentation at llvm/docs/Telemetry.rst for more details. 
+//===---------------------------------------------------------------------===// + #include "llvm/Telemetry/Telemetry.h" namespace llvm { @@ -22,5 +35,7 @@ void Manager::addDestination(std::unique_ptr Dest) { Destinations.push_back(std::move(Dest)); } +Error Manager::preDispatch(TelemetryInfo *Entry) { return Error::success(); } + } // namespace telemetry } // namespace llvm diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index a66d7ce9c3f50..02b0fcb3981a7 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -651,9 +651,9 @@ struct ArgumentUsesSummary { SmallDenseMap UsesPerBlock; }; -ArgumentAccessInfo getArgmentAccessInfo(const Instruction *I, - const ArgumentUse &ArgUse, - const DataLayout &DL) { +ArgumentAccessInfo getArgumentAccessInfo(const Instruction *I, + const ArgumentUse &ArgUse, + const DataLayout &DL) { auto GetTypeAccessRange = [&DL](Type *Ty, std::optional Offset) -> std::optional { @@ -805,7 +805,7 @@ ArgumentUsesSummary collectArgumentUsesPerBlock(Argument &A, Function &F) { } auto *I = cast(U); - bool HasWrite = UpdateUseInfo(I, getArgmentAccessInfo(I, ArgUse, DL)); + bool HasWrite = UpdateUseInfo(I, getArgumentAccessInfo(I, ArgUse, DL)); Result.HasAnyWrite |= HasWrite; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index 7ef95800975db..90cd279e8a457 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -1613,6 +1613,22 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) { if (Instruction *Overflow = foldLShrOverflowBit(I)) return Overflow; + // Transform ((pow2 << x) >> cttz(pow2 << y)) -> ((1 << x) >> y) + Value *Shl0_Op0, *Shl0_Op1, *Shl1_Op1; + BinaryOperator *Shl1; + if (match(Op0, m_Shl(m_Value(Shl0_Op0), m_Value(Shl0_Op1))) && + match(Op1, m_Intrinsic(m_BinOp(Shl1))) && + match(Shl1, m_Shl(m_Specific(Shl0_Op0), m_Value(Shl1_Op1))) && + isKnownToBeAPowerOfTwo(Shl0_Op0, /*OrZero=*/true, 0, &I)) { + auto *Shl0 = cast(Op0); + bool HasNUW = Shl0->hasNoUnsignedWrap() && Shl1->hasNoUnsignedWrap(); + bool HasNSW = Shl0->hasNoSignedWrap() && Shl1->hasNoSignedWrap(); + if (HasNUW || HasNSW) { + Value *NewShl = Builder.CreateShl(ConstantInt::get(Shl1->getType(), 1), + Shl0_Op1, "", HasNUW, HasNSW); + return BinaryOperator::CreateLShr(NewShl, Shl1_Op1); + } + } return nullptr; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index fbbc466f2f7f6..8089cfd1ce802 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1958,13 +1958,12 @@ class VPScalarPHIRecipe : public VPHeaderPHIRecipe { #endif }; -/// A recipe for handling phis that are widened in the vector loop. -/// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are -/// managed in the recipe directly. +/// A recipe for widened phis. Incoming values are operands of the recipe and +/// their operand index corresponds to the incoming predecessor block. If the +/// recipe is placed in an entry block to a (non-replicate) region, it must have +/// exactly 2 incoming values, the first from the predecessor of the region and +/// the second from the exiting block of the region. class VPWidenPHIRecipe : public VPSingleDefRecipe { - /// List of incoming blocks. Only used in the VPlan native path. 
- SmallVector IncomingBlocks; - public: /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start and /// debug location \p DL. @@ -1991,19 +1990,8 @@ class VPWidenPHIRecipe : public VPSingleDefRecipe { VPSlotTracker &SlotTracker) const override; #endif - /// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi. - void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) { - addOperand(IncomingV); - IncomingBlocks.push_back(IncomingBlock); - } - /// Returns the \p I th incoming VPBasicBlock. - VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; } - - /// Set the \p I th incoming VPBasicBlock to \p IncomingBlock. - void setIncomingBlock(unsigned I, VPBasicBlock *IncomingBlock) { - IncomingBlocks[I] = IncomingBlock; - } + VPBasicBlock *getIncomingBlock(unsigned I); /// Returns the \p I th incoming VPValue. VPValue *getIncomingValue(unsigned I) { return getOperand(I); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index 5a2e5d7cfee48..70d8575ba82c5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -134,21 +134,26 @@ void PlainCFGBuilder::fixPhiNodes() { if (isHeaderBB(Phi->getParent(), L)) { // For header phis, make sure the incoming value from the loop // predecessor is the first operand of the recipe. - assert(Phi->getNumOperands() == 2); + assert(Phi->getNumOperands() == 2 && + "header phi must have exactly 2 operands"); BasicBlock *LoopPred = L->getLoopPredecessor(); - VPPhi->addIncoming( - getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopPred)), - BB2VPBB[LoopPred]); + VPPhi->addOperand( + getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopPred))); BasicBlock *LoopLatch = L->getLoopLatch(); - VPPhi->addIncoming( - getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopLatch)), - BB2VPBB[LoopLatch]); + VPPhi->addOperand( + getOrCreateVPOperand(Phi->getIncomingValueForBlock(LoopLatch))); continue; } - for (unsigned I = 0; I != Phi->getNumOperands(); ++I) - VPPhi->addIncoming(getOrCreateVPOperand(Phi->getIncomingValue(I)), - BB2VPBB[Phi->getIncomingBlock(I)]); + // Add operands for VPPhi in the order matching its predecessors in VPlan. 
+ DenseMap VPPredToIncomingValue; + for (unsigned I = 0; I != Phi->getNumOperands(); ++I) { + VPPredToIncomingValue[BB2VPBB[Phi->getIncomingBlock(I)]] = + getOrCreateVPOperand(Phi->getIncomingValue(I)); + } + for (VPBlockBase *Pred : VPPhi->getParent()->getPredecessors()) + VPPhi->addOperand( + VPPredToIncomingValue.lookup(Pred->getExitingBasicBlock())); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1bba667c206cf..d57a6c481748c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3621,6 +3621,27 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +VPBasicBlock *VPWidenPHIRecipe::getIncomingBlock(unsigned I) { + VPBasicBlock *Parent = getParent(); + VPBlockBase *Pred = nullptr; + if (Parent->getNumPredecessors() > 0) { + Pred = Parent->getPredecessors()[I]; + } else { + auto *Region = Parent->getParent(); + assert(Region && !Region->isReplicator() && Region->getEntry() == Parent && + "must be in the entry block of a non-replicate region"); + assert( + I < 2 && getNumOperands() == 2 && + "when placed in an entry block, only 2 incoming blocks are available"); + + // I == 0 selects the predecessor of the region, I == 1 selects the region + // itself whose exiting block feeds the phi across the backedge. + Pred = I == 0 ? Region->getSinglePredecessor() : Region; + } + + return Pred->getExitingBasicBlock(); +} + void VPWidenPHIRecipe::execute(VPTransformState &State) { assert(EnableVPlanNativePath && "Non-native vplans are not expected to have VPWidenPHIRecipes."); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index ac5e1978fcfbe..6ddb88308955f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -169,16 +169,8 @@ class VPBlockUtils { static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New) { for (auto *Pred : to_vector(Old->getPredecessors())) Pred->replaceSuccessor(Old, New); - for (auto *Succ : to_vector(Old->getSuccessors())) { + for (auto *Succ : to_vector(Old->getSuccessors())) Succ->replacePredecessor(Old, New); - - // Replace any references to Old in widened phi incoming blocks. 
- for (auto &R : Succ->getEntryBasicBlock()->phis()) - if (auto *WidenPhiR = dyn_cast(&R)) - for (unsigned I = 0; I < WidenPhiR->getNumOperands(); I++) - if (WidenPhiR->getIncomingBlock(I) == Old) - WidenPhiR->setIncomingBlock(I, cast(New)); - } New->setPredecessors(Old->getPredecessors()); New->setSuccessors(Old->getSuccessors()); Old->clearPredecessors(); diff --git a/llvm/test/Analysis/BasicAA/escape-source-aggregate.ll b/llvm/test/Analysis/BasicAA/escape-source-aggregate.ll new file mode 100644 index 0000000000000..cef11b94f3873 --- /dev/null +++ b/llvm/test/Analysis/BasicAA/escape-source-aggregate.ll @@ -0,0 +1,24 @@ +; RUN: opt -passes=aa-eval -print-all-alias-modref-info -disable-output 2>&1 < %s | FileCheck %s + +declare { ptr, i1 } @get_struct() +declare <2 x ptr> @get_vec() + +; CHECK: MayAlias: i32* %a, i32* %extract +define i32 @test_extractvalue() { + %a = alloca i32 + %call = call { ptr, i1 } @get_struct() + %extract = extractvalue { ptr, i1 } %call, 0 + store i32 0, ptr %extract + %v = load i32, ptr %a + ret i32 %v +} + +; CHECK: MayAlias: i32* %a, i32* %extract +define i32 @test_extractelement() { + %a = alloca i32 + %call = call <2 x ptr> @get_vec() + %extract = extractelement <2 x ptr> %call, i32 0 + store i32 0, ptr %extract + %v = load i32, ptr %a + ret i32 %v +} diff --git a/llvm/test/Analysis/LoopAccessAnalysis/runtime-checks-may-wrap.ll b/llvm/test/Analysis/LoopAccessAnalysis/runtime-checks-may-wrap.ll new file mode 100644 index 0000000000000..b27937862b261 --- /dev/null +++ b/llvm/test/Analysis/LoopAccessAnalysis/runtime-checks-may-wrap.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='print' -disable-output %s 2>&1 | FileCheck %s + +target datalayout = "p:16:16" + +define void @geps_may_wrap(ptr %a, ptr %b, i64 %N) { +; CHECK-LABEL: 'geps_may_wrap' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ([[GRP1:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.iv = getelementptr i32, ptr %a, i64 %iv +; CHECK-NEXT: Against group ([[GRP2:0x[0-9a-f]+]]): +; CHECK-NEXT: ptr %b +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group [[GRP1]]: +; CHECK-NEXT: (Low: %a High: (16 + (12 * (trunc i128 ((zext i64 %N to i128) /u 3) to i16)) + %a)) +; CHECK-NEXT: Member: {%a,+,12}<%loop> +; CHECK-NEXT: Group [[GRP2]]: +; CHECK-NEXT: (Low: %b High: (4 + %b)) +; CHECK-NEXT: Member: %b +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {0,+,3}<%loop> Added Flags: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.iv = getelementptr i32, ptr %a, i64 %iv + store i32 0, ptr %gep.iv, align 1 + store i32 0, ptr %b, align 1 + %iv.next = add i64 %iv, 3 + %.not = icmp ult i64 %N, %iv + br i1 %.not, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll index 525995156481c..8603417081067 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll @@ -140,6 +140,53 @@ exit: ret void } +; Test with multiple GEP indices +define void @single_stride_array(ptr noalias %A, ptr noalias %B, i64 %N, i64 %stride) { +; CHECK-LABEL: 'single_stride_array' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Backward loop carried data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Backward: +; CHECK-NEXT: %load = load [2 x i32], ptr %gep.A, align 4 -> +; CHECK-NEXT: store [2 x i32] %ins, ptr %gep.A.next, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: Equal predicate: %stride == 1 +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: [PSE] %gep.A = getelementptr inbounds [2 x i32], ptr %A, i64 %mul, i64 1: +; CHECK-NEXT: {(4 + %A),+,(8 * %stride)}<%loop> +; CHECK-NEXT: --> {(4 + %A),+,8}<%loop> +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %mul = mul i64 %iv, %stride + %gep.A = getelementptr inbounds [2 x i32], ptr %A, i64 %mul, i64 1 + %load = load [2 x i32], ptr %gep.A, align 4 + %gep.B = getelementptr inbounds [2 x i32], ptr %B, i64 %iv + %load_1 = load [2 x i32], ptr %gep.B, align 4 + %v1 = extractvalue [2 x i32] %load, 0 + %v2 = extractvalue [2 x i32] %load_1, 0 + %add = add i32 %v1, %v2 + %ins = insertvalue [2 x i32] poison, i32 %add, 0 + %iv.next = add nuw nsw i64 %iv, 1 + %gep.A.next = getelementptr inbounds [2 x i32], ptr %A, i64 %iv.next + store [2 x i32] %ins, ptr %gep.A.next, align 4 + %exitcond = icmp eq i64 %iv.next, %N + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + define void @single_stride_castexpr(i32 %offset, ptr %src, ptr %dst, i1 %cond) { ; CHECK-LABEL: 'single_stride_castexpr' ; CHECK-NEXT: inner.loop: diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll index 9ece9edb84343..40daf8ffb63ea 100644 --- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NOI8MM ; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT ; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-I8MM +; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm 
-aarch64-enable-partial-reduce-nodes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { ; CHECK-DOT-LABEL: udot: diff --git a/llvm/test/CodeGen/AArch64/shift-const-ne-0.ll b/llvm/test/CodeGen/AArch64/shift-const-ne-0.ll new file mode 100644 index 0000000000000..be064d591613c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/shift-const-ne-0.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-unknown-unknown < %s -o -| FileCheck %s + +define i1 @lsr_1_ne_0_16(i16 %x) { +; CHECK-LABEL: lsr_1_ne_0_16: +; CHECK: // %bb.0: +; CHECK-NEXT: tst w0, #0xfffe +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %shr = lshr i16 %x, 1 + %cmp = icmp ne i16 %shr, 0 + ret i1 %cmp +} + +define i1 @lsr_1_ne_0_32(i32 %x) { +; CHECK-LABEL: lsr_1_ne_0_32: +; CHECK: // %bb.0: +; CHECK-NEXT: tst w0, #0xfffffffe +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %shr = lshr i32 %x, 1 + %cmp = icmp ne i32 %shr, 0 + ret i1 %cmp +} + +define i1 @lsr_30_ne_0_32(i32 %x) { +; CHECK-LABEL: lsr_30_ne_0_32: +; CHECK: // %bb.0: +; CHECK-NEXT: tst w0, #0xc0000000 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %shr = lshr i32 %x, 30 + %cmp = icmp ne i32 %shr, 0 + ret i1 %cmp +} + +define i1 @lsr_31_ne_0_32(i32 %x) { +; CHECK-LABEL: lsr_31_ne_0_32: +; CHECK: // %bb.0: +; CHECK-NEXT: lsr w0, w0, #31 +; CHECK-NEXT: ret + %shr = lshr i32 %x, 31 + %cmp = icmp ne i32 %shr, 0 + ret i1 %cmp +} + +define i1 @lsr_1_ne_0_64(i64 %x) { +; CHECK-LABEL: lsr_1_ne_0_64: +; CHECK: // %bb.0: +; CHECK-NEXT: tst x0, #0xfffffffffffffffe +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %shr = lshr i64 %x, 1 + %cmp = icmp ne i64 %shr, 0 + ret i1 %cmp +} + +define i1 @lsr_31_ne_0_64(i64 %x) { +; CHECK-LABEL: lsr_31_ne_0_64: +; CHECK: // %bb.0: +; CHECK-NEXT: tst x0, #0xffffffff80000000 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %shr = lshr i64 %x, 31 + %cmp = icmp ne i64 %shr, 0 + ret i1 %cmp +} + +define i1 @lsr_32_ne_0_64(i64 %x) { +; CHECK-LABEL: lsr_32_ne_0_64: +; CHECK: // %bb.0: +; CHECK-NEXT: tst x0, #0xffffffff00000000 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %shr = lshr i64 %x, 32 + %cmp = icmp ne i64 %shr, 0 + ret i1 %cmp +} + +define i1 @lsr_33_ne_0_64(i64 %x) { +; CHECK-LABEL: lsr_33_ne_0_64: +; CHECK: // %bb.0: +; CHECK-NEXT: tst x0, #0xfffffffe00000000 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %shr = lshr i64 %x, 33 + %cmp = icmp ne i64 %shr, 0 + ret i1 %cmp +} + +define i1 @lsr_62_ne_0_64(i64 %x) { +; CHECK-LABEL: lsr_62_ne_0_64: +; CHECK: // %bb.0: +; CHECK-NEXT: tst x0, #0xc000000000000000 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %shr = lshr i64 %x, 62 + %cmp = icmp ne i64 %shr, 0 + ret i1 %cmp +} + +define i1 @lsr_63_ne_0_64(i64 %x) { +; CHECK-LABEL: lsr_63_ne_0_64: +; CHECK: // %bb.0: +; CHECK-NEXT: lsr x0, x0, #63 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %shr = lshr i64 %x, 63 + %cmp = icmp ne i64 %shr, 0 + ret i1 %cmp +} + +define <4 x i1> @lsr_1_ne_0_v4i16(<4 x i16> %x) { +; CHECK-LABEL: lsr_1_ne_0_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ushr v0.4h, v0.4h, #1 +; CHECK-NEXT: cmtst v0.4h, v0.4h, v0.4h +; CHECK-NEXT: ret + %shr = lshr <4 x i16> %x, + %cmp = icmp ne <4 x i16> %shr, + ret <4 x i1> %cmp +} diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll index 66f83c658ff4f..455231dd37be6 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll @@ -1,12 +1,36 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-I8MM ; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM +; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING define @udot( %acc, %a, %b) { ; CHECK-LABEL: udot: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: udot z0.s, z1.b, z2.b ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: udot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z1.b +; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: ptrue p0.s +; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h +; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s +; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s +; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext %a to %b.wide = zext %b to @@ -20,6 +44,29 @@ define @udot_wide( %acc, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: udot z0.d, z1.h, z2.h ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: udot_wide: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext %a to %b.wide = zext %b to @@ -33,6 +80,29 @@ define @sdot( %accc, %a, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sdot z0.s, z1.b, z2.b ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sdot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z1.b +; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: ptrue p0.s +; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z3.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h +; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: 
sunpkhi z4.s, z4.h +; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s +; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s +; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext %a to %b.wide = sext %b to @@ -46,6 +116,29 @@ define @sdot_wide( %acc, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sdot z0.d, z1.h, z2.h ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sdot_wide: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z2.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext %a to %b.wide = sext %b to @@ -82,6 +175,29 @@ define @usdot( %acc, %a, ; CHECK-NOI8MM-NEXT: mla z1.s, p0/m, z7.s, z24.s ; CHECK-NOI8MM-NEXT: add z0.s, z1.s, z0.s ; CHECK-NOI8MM-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: usdot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z1.b +; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: ptrue p0.s +; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z3.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h +; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h +; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s +; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s +; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext %a to %b.wide = sext %b to @@ -118,6 +234,29 @@ define @sudot( %acc, %a, ; CHECK-NOI8MM-NEXT: mla z1.s, p0/m, z7.s, z24.s ; CHECK-NOI8MM-NEXT: add z0.s, z1.s, z0.s ; CHECK-NOI8MM-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sudot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z1.b +; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b +; CHECK-NEWLOWERING-NEXT: ptrue p0.s +; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z3.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h +; 
CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h +; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s +; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s +; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext %a to %b.wide = zext %b to @@ -136,6 +275,63 @@ define @udot_8to64( %acc, %a to %b.wide = zext %b to @@ -155,6 +351,63 @@ define @sdot_8to64( %acc, %a to %b.wide = sext %b to @@ -231,6 +484,63 @@ define @usdot_8to64( %acc, %a to %b.wide = sext %b to @@ -307,6 +617,63 @@ define @sudot_8to64( %acc, %a to %b.wide = zext %b to @@ -322,6 +689,20 @@ define @udot_no_bin_op( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %a.ext) ret %partial.reduce @@ -333,6 +714,20 @@ define @sdot_no_bin_op( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %acc, %a.ext) ret %partial.reduce @@ -344,6 +739,20 @@ define @udot_no_bin_op_wide( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %a.wide) @@ -356,6 +765,20 @@ define @sdot_no_bin_op_wide( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %a.wide) @@ -373,6 +796,32 @@ define @udot_no_bin_op_8to64( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %a.ext) ret %partial.reduce @@ -389,6 +838,32 @@ define @sdot_no_bin_op_8to64( %acc, %a to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64( %acc, %a.ext) ret %partial.reduce @@ -407,6 +882,19 @@ define @not_udot( %acc, % ; CHECK-NEXT: mla z0.s, p0/m, z3.s, z4.s ; CHECK-NEXT: mla z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: not_udot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff +; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff +; CHECK-NEWLOWERING-NEXT: ptrue p0.s +; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z3.s, z4.s +; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext %a to %b.wide = zext %b to @@ -428,6 +916,19 @@ define @not_udot_wide( %acc, %a to %b.wide = zext %b to @@ -459,6 +960,29 @@ define @not_usdot( %acc, ; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d ; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: not_usdot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpkhi 
z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z2.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = zext %a to %b.wide = sext %b to @@ -490,6 +1014,29 @@ define @not_sudot( %acc, ; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d ; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: not_sudot: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEWLOWERING-NEXT: ptrue p0.d +; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s +; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s +; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z1.s +; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d +; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d +; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d +; CHECK-NEWLOWERING-NEXT: movprfx z1, z3 +; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret entry: %a.wide = sext %a to %b.wide = zext %b to @@ -522,6 +1069,30 @@ define @udot_different_types( %acc, %a to %b.wide = zext %b to @@ -555,6 +1126,31 @@ define @sdot_different_types( %acc, %a to %b.wide = sext %b to @@ -588,6 +1184,31 @@ define @usdot_different_types( %acc, %a to %b.wide = sext %b to @@ -620,6 +1241,30 @@ define @sudot_different_types( %acc, %a to %b.wide = zext %b to @@ -627,3 +1272,89 @@ entry: %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %acc, %mult) ret %partial.reduce } + +define @udot_nxv8i8_promote ( %acc, %a, %b){ +; CHECK-LABEL: udot_nxv8i8_promote: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and z1.h, z1.h, #0xff +; CHECK-NEXT: and z2.h, z2.h, #0xff +; CHECK-NEXT: mul z1.h, z1.h, z2.h +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEXT: uunpklo z3.d, z2.s +; CHECK-NEXT: uunpklo z4.d, z1.s +; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: add z0.d, z0.d, z3.d +; CHECK-NEXT: add z2.d, z2.d, z4.d +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: udot_nxv8i8_promote: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff +; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff +; CHECK-NEWLOWERING-NEXT: mul z1.h, z1.h, z2.h +; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z3.d +; CHECK-NEWLOWERING-NEXT: add z2.d, z2.d, z4.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret +entry: + %a.wide = zext %a to + %b.wide = 
zext %b to + %mult = mul nuw nsw %a.wide, %b.wide + %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i16.nxv8i16( %acc, %mult) + ret %partial.reduce +} + +define @sdot_nxv8i8_promote ( %acc, %a, %b){ +; CHECK-LABEL: sdot_nxv8i8_promote: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: sxtb z1.h, p0/m, z1.h +; CHECK-NEXT: sxtb z2.h, p0/m, z2.h +; CHECK-NEXT: mul z1.h, z1.h, z2.h +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEXT: uunpklo z3.d, z2.s +; CHECK-NEXT: uunpklo z4.d, z1.s +; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: add z0.d, z0.d, z3.d +; CHECK-NEXT: add z2.d, z2.d, z4.d +; CHECK-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEXT: ret +; +; CHECK-NEWLOWERING-LABEL: sdot_nxv8i8_promote: +; CHECK-NEWLOWERING: // %bb.0: // %entry +; CHECK-NEWLOWERING-NEXT: ptrue p0.h +; CHECK-NEWLOWERING-NEXT: sxtb z1.h, p0/m, z1.h +; CHECK-NEWLOWERING-NEXT: sxtb z2.h, p0/m, z2.h +; CHECK-NEWLOWERING-NEXT: mul z1.h, z1.h, z2.h +; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z1.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z3.d +; CHECK-NEWLOWERING-NEXT: add z2.d, z2.d, z4.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d +; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d +; CHECK-NEWLOWERING-NEXT: ret +entry: + %a.wide = sext %a to + %b.wide = sext %b to + %mult = mul nuw nsw %a.wide, %b.wide + %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i16.nxv8i16( %acc, %mult) + ret %partial.reduce +} diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll index b4b946c68566e..11fb60ead4fb2 100644 --- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll +++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE2 ; RUN: llc -mtriple=aarch64 -mattr=+sve %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE +; RUN: llc -mtriple=aarch64 -mattr=+sve2 -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEWLOWERING define @signed_wide_add_nxv4i32( %acc, %input){ ; CHECK-SVE2-LABEL: signed_wide_add_nxv4i32: @@ -16,6 +17,14 @@ define @signed_wide_add_nxv4i32( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64( %acc, %input.wide) @@ -36,6 +45,14 @@ define @unsigned_wide_add_nxv4i32( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64( %acc, %input.wide) @@ -56,6 +73,14 @@ define @signed_wide_add_nxv8i16( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32( %acc, %input.wide) @@ -76,6 +101,14 @@ define @unsigned_wide_add_nxv8i16( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32( %acc, %input.wide) @@ -96,6 +129,14 @@ define @signed_wide_add_nxv16i8( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16( %acc, %input.wide) @@ -116,6 +157,14 @@ 
define @unsigned_wide_add_nxv16i8( %acc, %input to %partial.reduce = tail call @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16( %acc, %input.wide) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index f2a4332bcb8ba..c136028f2de43 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -2872,8 +2872,8 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) { ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: dyn_extract_v7f64_v_v: @@ -2898,8 +2898,8 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) { ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v14 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: dyn_extract_v7f64_v_v: @@ -2918,7 +2918,7 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) { ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v14 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v12 :: v_dual_cndmask_b32 v1, v1, v13 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v14 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v0 :: v_dual_cndmask_b32 v1, v1, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <7 x double> %vec, i32 %sel diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index 5f56568ef88e4..afcd9b5fcdc7e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -444,6 +444,652 @@ define float @no_unsafe(ptr %addr, float %val) { ret float %res } +@global = hidden addrspace(1) global i64 0, align 8 + +; Make sure there is no error on an invalid addrspacecast without optimizations +define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 { +; GFX908-LABEL: optnone_atomicrmw_add_i64_expand: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: s_mov_b32 s6, 32 +; GFX908-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GFX908-NEXT: s_getpc_b64 s[6:7] +; GFX908-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 +; GFX908-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 +; GFX908-NEXT: s_cmp_eq_u32 s7, s4 +; GFX908-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX908-NEXT: s_mov_b64 s[4:5], -1 +; GFX908-NEXT: s_mov_b32 s6, 1 +; GFX908-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6 +; GFX908-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX908-NEXT: s_cbranch_vccnz .LBB4_3 +; GFX908-NEXT: .LBB4_1: ; %Flow +; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX908-NEXT: s_mov_b32 s4, 1 +; GFX908-NEXT: v_cmp_ne_u32_e64 s[4:5], v2, s4 +; GFX908-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX908-NEXT: s_cbranch_vccnz .LBB4_4 +; GFX908-NEXT: ; %bb.2: ; 
%atomicrmw.private +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_co_u32_e64 v0, s[4:5], v3, v0 +; GFX908-NEXT: v_addc_co_u32_e64 v1, s[4:5], v4, v1, s[4:5] +; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen +; GFX908-NEXT: s_branch .LBB4_4 +; GFX908-NEXT: .LBB4_3: ; %atomicrmw.global +; GFX908-NEXT: s_getpc_b64 s[4:5] +; GFX908-NEXT: s_add_u32 s4, s4, global@rel32@lo+4 +; GFX908-NEXT: s_addc_u32 s5, s5, global@rel32@hi+12 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: v_mov_b32_e32 v3, s5 +; GFX908-NEXT: flat_atomic_add_x2 v[3:4], v[2:3], v[0:1] glc +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_branch .LBB4_1 +; GFX908-NEXT: .LBB4_4: ; %atomicrmw.phi +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.end +; GFX908-NEXT: s_mov_b32 s4, 32 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_lshrrev_b64 v[1:2], s4, v[3:4] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: optnone_atomicrmw_add_i64_expand: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: s_mov_b32 s6, 32 +; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GFX90A-NEXT: s_getpc_b64 s[6:7] +; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 +; GFX90A-NEXT: s_cmp_eq_u32 s7, s4 +; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX90A-NEXT: s_mov_b64 s[4:5], -1 +; GFX90A-NEXT: s_mov_b32 s6, 1 +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6 +; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_cbranch_vccnz .LBB4_3 +; GFX90A-NEXT: .LBB4_1: ; %Flow +; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX90A-NEXT: s_mov_b32 s4, 1 +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[4:5], v4, s4 +; GFX90A-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_vccnz .LBB4_4 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.private +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_add_co_u32_e64 v0, s[4:5], v2, v0 +; GFX90A-NEXT: v_addc_co_u32_e64 v1, s[4:5], v3, v1, s[4:5] +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_branch .LBB4_4 +; GFX90A-NEXT: .LBB4_3: ; %atomicrmw.global +; GFX90A-NEXT: s_getpc_b64 s[4:5] +; GFX90A-NEXT: s_add_u32 s4, s4, global@rel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s5, s5, global@rel32@hi+12 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_branch .LBB4_1 +; GFX90A-NEXT: .LBB4_4: ; %atomicrmw.phi +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.end +; GFX90A-NEXT: s_mov_b32 s4, 32 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_lshrrev_b64 v[4:5], s4, v[2:3] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: optnone_atomicrmw_add_i64_expand: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: s_mov_b32 s2, 32 +; 
GFX942-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX942-NEXT: s_getpc_b64 s[2:3] +; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 +; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 +; GFX942-NEXT: s_cmp_eq_u32 s3, s0 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], -1 +; GFX942-NEXT: s_mov_b32 s2, 1 +; GFX942-NEXT: v_cmp_ne_u32_e64 s[2:3], v2, s2 +; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_cbranch_vccnz .LBB4_3 +; GFX942-NEXT: .LBB4_1: ; %Flow +; GFX942-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX942-NEXT: s_mov_b32 s0, 1 +; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], v4, s0 +; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX942-NEXT: s_cbranch_vccnz .LBB4_4 +; GFX942-NEXT: ; %bb.2: ; %atomicrmw.private +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] +; GFX942-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX942-NEXT: s_branch .LBB4_4 +; GFX942-NEXT: .LBB4_3: ; %atomicrmw.global +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, global@rel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, global@rel32@hi+12 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_branch .LBB4_1 +; GFX942-NEXT: .LBB4_4: ; %atomicrmw.phi +; GFX942-NEXT: ; %bb.5: ; %atomicrmw.end +; GFX942-NEXT: s_mov_b32 s0, 32 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshrrev_b64 v[4:5], s0, v[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-LABEL: optnone_atomicrmw_add_i64_expand: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1100-NEXT: s_mov_b32 s2, 32 +; GFX1100-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX1100-NEXT: s_getpc_b64 s[2:3] +; GFX1100-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 +; GFX1100-NEXT: s_cmp_eq_u32 s3, s0 +; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 +; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1100-NEXT: s_mov_b32 s0, -1 +; GFX1100-NEXT: s_mov_b32 s1, 1 +; GFX1100-NEXT: v_cmp_ne_u32_e64 s1, v2, s1 +; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1 +; GFX1100-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1100-NEXT: s_cbranch_vccnz .LBB4_3 +; GFX1100-NEXT: .LBB4_1: ; %Flow +; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1100-NEXT: s_mov_b32 s0, 1 +; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, v2, s0 +; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1100-NEXT: s_cbranch_vccnz .LBB4_4 +; GFX1100-NEXT: ; %bb.2: ; %atomicrmw.private +; GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-NEXT: scratch_load_b64 v[3:4], off, s0 +; GFX1100-NEXT: s_waitcnt vmcnt(0) +; GFX1100-NEXT: v_add_co_u32 v0, s0, v3, v0 +; GFX1100-NEXT: v_add_co_ci_u32_e64 v1, s0, v4, v1, s0 +; GFX1100-NEXT: scratch_store_b64 off, v[0:1], s0 +; GFX1100-NEXT: s_branch .LBB4_4 +; GFX1100-NEXT: .LBB4_3: ; %atomicrmw.global +; GFX1100-NEXT: s_getpc_b64 s[0:1] +; GFX1100-NEXT: s_add_u32 s0, s0, global@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s1, s1, global@rel32@hi+12 +; GFX1100-NEXT: v_mov_b32_e32 v3, s1 +; GFX1100-NEXT: v_mov_b32_e32 v2, s0 +; GFX1100-NEXT: flat_atomic_add_u64 v[3:4], v[2:3], v[0:1] glc +; 
GFX1100-NEXT: s_mov_b32 s0, 0 +; GFX1100-NEXT: s_branch .LBB4_1 +; GFX1100-NEXT: .LBB4_4: ; %atomicrmw.phi +; GFX1100-NEXT: ; %bb.5: ; %atomicrmw.end +; GFX1100-NEXT: s_mov_b32 s0, 32 +; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_lshrrev_b64 v[1:2], s0, v[3:4] +; GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-LABEL: optnone_atomicrmw_add_i64_expand: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: s_wait_expcnt 0x0 +; GFX1200-NEXT: s_wait_samplecnt 0x0 +; GFX1200-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1200-NEXT: s_mov_b32 s2, 32 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX1200-NEXT: s_getpc_b64 s[2:3] +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_sext_i32_i16 s3, s3 +; GFX1200-NEXT: s_add_co_u32 s2, s2, global@rel32@lo+12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_ci_u32 s3, s3, global@rel32@hi+24 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cmp_eq_u32 s3, s0 +; GFX1200-NEXT: s_cselect_b32 s0, -1, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1200-NEXT: s_mov_b32 s0, -1 +; GFX1200-NEXT: s_mov_b32 s1, 1 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cmp_ne_u32_e64 s1, v2, s1 +; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s1 +; GFX1200-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cbranch_vccnz .LBB4_3 +; GFX1200-NEXT: .LBB4_1: ; %Flow +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1200-NEXT: s_mov_b32 s0, 1 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cmp_ne_u32_e64 s0, v2, s0 +; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cbranch_vccnz .LBB4_4 +; GFX1200-NEXT: ; %bb.2: ; %atomicrmw.private +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: scratch_load_b64 v[3:4], off, s0 +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_add_co_u32 v0, s0, v3, v0 +; GFX1200-NEXT: s_wait_alu 0xf1ff +; GFX1200-NEXT: v_add_co_ci_u32_e64 v1, s0, v4, v1, s0 +; GFX1200-NEXT: scratch_store_b64 off, v[0:1], s0 +; GFX1200-NEXT: s_branch .LBB4_4 +; GFX1200-NEXT: .LBB4_3: ; %atomicrmw.global +; GFX1200-NEXT: s_getpc_b64 s[0:1] +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_sext_i32_i16 s1, s1 +; GFX1200-NEXT: s_add_co_u32 s0, s0, global@rel32@lo+12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_ci_u32 s1, s1, global@rel32@hi+24 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_mov_b32_e32 v3, s1 +; GFX1200-NEXT: v_mov_b32_e32 v2, s0 +; GFX1200-NEXT: flat_atomic_add_u64 v[3:4], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1200-NEXT: s_mov_b32 s0, 0 +; GFX1200-NEXT: s_branch .LBB4_1 +; GFX1200-NEXT: .LBB4_4: ; %atomicrmw.phi +; GFX1200-NEXT: ; %bb.5: ; %atomicrmw.end +; GFX1200-NEXT: s_mov_b32 s0, 32 +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: s_wait_alu 0xf1fe +; GFX1200-NEXT: v_lshrrev_b64 v[1:2], s0, v[3:4] +; GFX1200-NEXT: v_mov_b32_e32 v0, v3 +; GFX1200-NEXT: s_setpc_b64 s[30:31] + %rmw = atomicrmw add ptr addrspacecast (ptr addrspace(1) @global to ptr), i64 %val syncscope("agent") monotonic, align 8 + ret i64 %rmw +} + +; Make sure there is no error on an invalid addrspacecast without optimizations +define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 { +; GFX908-LABEL: 
optnone_atomicrmw_fadd_f64_expand: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX908-NEXT: s_mov_b32 s6, 32 +; GFX908-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GFX908-NEXT: s_getpc_b64 s[6:7] +; GFX908-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 +; GFX908-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 +; GFX908-NEXT: s_cmp_eq_u32 s7, s4 +; GFX908-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX908-NEXT: s_mov_b64 s[4:5], -1 +; GFX908-NEXT: s_mov_b32 s6, 1 +; GFX908-NEXT: v_readfirstlane_b32 s7, v2 +; GFX908-NEXT: s_cmp_lg_u32 s7, s6 +; GFX908-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX908-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX908-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX908-NEXT: s_cbranch_vccnz .LBB5_2 +; GFX908-NEXT: s_branch .LBB5_3 +; GFX908-NEXT: .LBB5_1: ; %atomicrmw.private +; GFX908-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v4, v3 +; GFX908-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] +; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX908-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen +; GFX908-NEXT: s_branch .LBB5_6 +; GFX908-NEXT: .LBB5_2: ; %atomicrmw.global +; GFX908-NEXT: s_getpc_b64 s[4:5] +; GFX908-NEXT: s_add_u32 s4, s4, global@rel32@lo+4 +; GFX908-NEXT: s_addc_u32 s5, s5, global@rel32@hi+12 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: v_mov_b32_e32 v3, s5 +; GFX908-NEXT: flat_load_dwordx2 v[3:4], v[2:3] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_branch .LBB5_4 +; GFX908-NEXT: .LBB5_3: ; %Flow +; GFX908-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX908-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX908-NEXT: s_branch .LBB5_6 +; GFX908-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v5, v3 +; GFX908-NEXT: v_add_f64 v[3:4], v[5:6], v[0:1] +; GFX908-NEXT: s_getpc_b64 s[6:7] +; GFX908-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 +; GFX908-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 +; GFX908-NEXT: v_mov_b32_e32 v8, s7 +; GFX908-NEXT: v_mov_b32_e32 v7, s6 +; GFX908-NEXT: flat_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: v_cmp_eq_u64_e64 s[6:7], v[3:4], v[5:6] +; GFX908-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_cbranch_execnz .LBB5_4 +; GFX908-NEXT: ; %bb.5: ; %atomicrmw.end1 +; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX908-NEXT: s_mov_b64 s[4:5], 0 +; GFX908-NEXT: s_branch .LBB5_3 +; GFX908-NEXT: .LBB5_6: ; %atomicrmw.phi +; GFX908-NEXT: ; %bb.7: ; %atomicrmw.end +; GFX908-NEXT: s_mov_b32 s4, 32 +; GFX908-NEXT: v_lshrrev_b64 v[1:2], s4, v[3:4] +; GFX908-NEXT: v_mov_b32_e32 v0, v3 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: optnone_atomicrmw_fadd_f64_expand: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: s_mov_b32 s6, 32 +; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GFX90A-NEXT: s_getpc_b64 s[6:7] +; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 +; GFX90A-NEXT: s_cmp_eq_u32 s7, s4 +; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX90A-NEXT: s_mov_b64 s[4:5], 
-1 +; GFX90A-NEXT: s_mov_b32 s6, 1 +; GFX90A-NEXT: v_readfirstlane_b32 s7, v2 +; GFX90A-NEXT: s_cmp_lg_u32 s7, s6 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_cbranch_vccnz .LBB5_2 +; GFX90A-NEXT: s_branch .LBB5_3 +; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.private +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_branch .LBB5_6 +; GFX90A-NEXT: .LBB5_2: ; %atomicrmw.global +; GFX90A-NEXT: s_getpc_b64 s[4:5] +; GFX90A-NEXT: s_add_u32 s4, s4, global@rel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s5, s5, global@rel32@hi+12 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_branch .LBB5_4 +; GFX90A-NEXT: .LBB5_3: ; %Flow +; GFX90A-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX90A-NEXT: s_branch .LBB5_6 +; GFX90A-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX90A-NEXT: s_getpc_b64 s[6:7] +; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4 +; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12 +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e64 s[6:7], v[2:3], v[4:5] +; GFX90A-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB5_4 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.end1 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: s_branch .LBB5_3 +; GFX90A-NEXT: .LBB5_6: ; %atomicrmw.phi +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.end +; GFX90A-NEXT: s_mov_b32 s4, 32 +; GFX90A-NEXT: v_lshrrev_b64 v[4:5], s4, v[2:3] +; GFX90A-NEXT: v_mov_b32_e32 v0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: optnone_atomicrmw_fadd_f64_expand: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX942-NEXT: s_mov_b32 s2, 32 +; GFX942-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX942-NEXT: s_getpc_b64 s[2:3] +; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 +; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 +; GFX942-NEXT: s_cmp_eq_u32 s3, s0 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], -1 +; GFX942-NEXT: s_mov_b32 s2, 1 +; GFX942-NEXT: v_readfirstlane_b32 s3, v2 +; GFX942-NEXT: s_cmp_lg_u32 s3, s2 +; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX942-NEXT: s_cbranch_vccnz .LBB5_2 +; GFX942-NEXT: s_branch .LBB5_3 +; GFX942-NEXT: .LBB5_1: ; %atomicrmw.private +; GFX942-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] +; GFX942-NEXT: 
scratch_store_dwordx2 off, v[0:1], s0 +; GFX942-NEXT: s_branch .LBB5_6 +; GFX942-NEXT: .LBB5_2: ; %atomicrmw.global +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, global@rel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, global@rel32@hi+12 +; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_branch .LBB5_4 +; GFX942-NEXT: .LBB5_3: ; %Flow +; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX942-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX942-NEXT: s_branch .LBB5_6 +; GFX942-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX942-NEXT: s_getpc_b64 s[2:3] +; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 +; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 +; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[2:3] +; GFX942-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[6:7], v[2:5] sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_cmp_eq_u64_e64 s[2:3], v[2:3], v[4:5] +; GFX942-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX942-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_cbranch_execnz .LBB5_4 +; GFX942-NEXT: ; %bb.5: ; %atomicrmw.end1 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_mov_b64 s[0:1], 0 +; GFX942-NEXT: s_branch .LBB5_3 +; GFX942-NEXT: .LBB5_6: ; %atomicrmw.phi +; GFX942-NEXT: ; %bb.7: ; %atomicrmw.end +; GFX942-NEXT: s_mov_b32 s0, 32 +; GFX942-NEXT: v_lshrrev_b64 v[4:5], s0, v[2:3] +; GFX942-NEXT: v_mov_b32_e32 v0, v2 +; GFX942-NEXT: v_mov_b32_e32 v1, v4 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-LABEL: optnone_atomicrmw_fadd_f64_expand: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1100-NEXT: s_mov_b32 s2, 32 +; GFX1100-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX1100-NEXT: s_getpc_b64 s[2:3] +; GFX1100-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 +; GFX1100-NEXT: s_cmp_eq_u32 s3, s0 +; GFX1100-NEXT: s_cselect_b32 s0, -1, 0 +; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1100-NEXT: s_mov_b32 s0, -1 +; GFX1100-NEXT: s_mov_b32 s1, 1 +; GFX1100-NEXT: v_cmp_ne_u32_e64 s1, v2, s1 +; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1 +; GFX1100-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1100-NEXT: s_cbranch_vccnz .LBB5_2 +; GFX1100-NEXT: s_branch .LBB5_3 +; GFX1100-NEXT: .LBB5_1: ; %atomicrmw.private +; GFX1100-NEXT: scratch_load_b64 v[3:4], off, s0 +; GFX1100-NEXT: s_waitcnt vmcnt(0) +; GFX1100-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] +; GFX1100-NEXT: scratch_store_b64 off, v[0:1], s0 +; GFX1100-NEXT: s_branch .LBB5_6 +; GFX1100-NEXT: .LBB5_2: ; %atomicrmw.global +; GFX1100-NEXT: s_getpc_b64 s[0:1] +; GFX1100-NEXT: s_add_u32 s0, s0, global@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s1, s1, global@rel32@hi+12 +; GFX1100-NEXT: v_mov_b32_e32 v3, s1 +; GFX1100-NEXT: v_mov_b32_e32 v2, s0 +; GFX1100-NEXT: flat_load_b64 v[3:4], v[2:3] +; GFX1100-NEXT: s_mov_b32 s0, 0 +; GFX1100-NEXT: s_branch .LBB5_4 +; GFX1100-NEXT: .LBB5_3: ; %Flow +; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1100-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX1100-NEXT: s_branch .LBB5_6 +; GFX1100-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1100-NEXT: 
v_mov_b32_e32 v6, v4 +; GFX1100-NEXT: v_mov_b32_e32 v5, v3 +; GFX1100-NEXT: v_add_f64 v[3:4], v[5:6], v[0:1] +; GFX1100-NEXT: s_getpc_b64 s[2:3] +; GFX1100-NEXT: s_add_u32 s2, s2, global@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12 +; GFX1100-NEXT: v_mov_b32_e32 v8, s3 +; GFX1100-NEXT: v_mov_b32_e32 v7, s2 +; GFX1100-NEXT: flat_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6] glc +; GFX1100-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_cmp_eq_u64_e64 s1, v[3:4], v[5:6] +; GFX1100-NEXT: s_or_b32 s0, s1, s0 +; GFX1100-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1100-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1100-NEXT: ; %bb.5: ; %atomicrmw.end1 +; GFX1100-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1100-NEXT: s_mov_b32 s0, 0 +; GFX1100-NEXT: s_branch .LBB5_3 +; GFX1100-NEXT: .LBB5_6: ; %atomicrmw.phi +; GFX1100-NEXT: ; %bb.7: ; %atomicrmw.end +; GFX1100-NEXT: s_mov_b32 s0, 32 +; GFX1100-NEXT: v_lshrrev_b64 v[1:2], s0, v[3:4] +; GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-LABEL: optnone_atomicrmw_fadd_f64_expand: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: s_wait_expcnt 0x0 +; GFX1200-NEXT: s_wait_samplecnt 0x0 +; GFX1200-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1200-NEXT: s_mov_b32 s2, 32 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX1200-NEXT: s_getpc_b64 s[2:3] +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_sext_i32_i16 s3, s3 +; GFX1200-NEXT: s_add_co_u32 s2, s2, global@rel32@lo+12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_ci_u32 s3, s3, global@rel32@hi+24 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cmp_eq_u32 s3, s0 +; GFX1200-NEXT: s_cselect_b32 s0, -1, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX1200-NEXT: s_mov_b32 s0, -1 +; GFX1200-NEXT: s_mov_b32 s1, 1 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_cmp_ne_u32_e64 s1, v2, s1 +; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s1 +; GFX1200-NEXT: ; implicit-def: $vgpr3_vgpr4 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cbranch_vccnz .LBB5_2 +; GFX1200-NEXT: s_branch .LBB5_3 +; GFX1200-NEXT: .LBB5_1: ; %atomicrmw.private +; GFX1200-NEXT: scratch_load_b64 v[3:4], off, s0 +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_add_f64_e64 v[0:1], v[3:4], v[0:1] +; GFX1200-NEXT: scratch_store_b64 off, v[0:1], s0 +; GFX1200-NEXT: s_branch .LBB5_6 +; GFX1200-NEXT: .LBB5_2: ; %atomicrmw.global +; GFX1200-NEXT: s_getpc_b64 s[0:1] +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_sext_i32_i16 s1, s1 +; GFX1200-NEXT: s_add_co_u32 s0, s0, global@rel32@lo+12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_ci_u32 s1, s1, global@rel32@hi+24 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_mov_b32_e32 v3, s1 +; GFX1200-NEXT: v_mov_b32_e32 v2, s0 +; GFX1200-NEXT: flat_load_b64 v[3:4], v[2:3] +; GFX1200-NEXT: s_mov_b32 s0, 0 +; GFX1200-NEXT: s_branch .LBB5_4 +; GFX1200-NEXT: .LBB5_3: ; %Flow +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cbranch_vccnz .LBB5_1 +; GFX1200-NEXT: s_branch .LBB5_6 +; GFX1200-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: v_mov_b32_e32 v6, v4 +; GFX1200-NEXT: v_mov_b32_e32 v5, v3 +; GFX1200-NEXT: v_add_f64_e64 v[3:4], 
v[5:6], v[0:1] +; GFX1200-NEXT: s_getpc_b64 s[2:3] +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_sext_i32_i16 s3, s3 +; GFX1200-NEXT: s_add_co_u32 s2, s2, global@rel32@lo+12 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_add_co_ci_u32 s3, s3, global@rel32@hi+24 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_mov_b32_e32 v8, s3 +; GFX1200-NEXT: v_mov_b32_e32 v7, s2 +; GFX1200-NEXT: flat_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-NEXT: v_cmp_eq_u64_e64 s1, v[3:4], v[5:6] +; GFX1200-NEXT: s_or_b32 s0, s1, s0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1200-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1200-NEXT: ; %bb.5: ; %atomicrmw.end1 +; GFX1200-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1200-NEXT: s_mov_b32 s0, 0 +; GFX1200-NEXT: s_branch .LBB5_3 +; GFX1200-NEXT: .LBB5_6: ; %atomicrmw.phi +; GFX1200-NEXT: ; %bb.7: ; %atomicrmw.end +; GFX1200-NEXT: s_mov_b32 s0, 32 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_lshrrev_b64 v[1:2], s0, v[3:4] +; GFX1200-NEXT: v_mov_b32_e32 v0, v3 +; GFX1200-NEXT: s_setpc_b64 s[30:31] + %rmw = atomicrmw fadd ptr addrspacecast (ptr addrspace(1) @global to ptr), double %val monotonic, align 8 + ret double %rmw +} + attributes #0 = { nounwind } +attributes #1 = { noinline nounwind optnone } !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll index ff80e05197b0d..db9ce56ecc3cc 100644 --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s @@ -5,110 +6,258 @@ declare hidden void @external_void_func_void() #3 -; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: -; GCN: s_getpc_b64 s[34:35] -; GCN-NEXT: s_add_u32 s34, s34, -; GCN-NEXT: s_addc_u32 s35, s35, -; GCN: s_swappc_b64 s[30:31], s[34:35] - -; GCN-NEXT: #ASMSTART -; GCN-NEXT: #ASMEND -; GCN-NEXT: s_swappc_b64 s[30:31], s[34:35] define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { +; FLATSCR-LABEL: test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_getpc_b64 s[34:35] +; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] +; FLATSCR-NEXT: s_endpgm call void @external_void_func_void() call void asm sideeffect "", ""() #0 call void @external_void_func_void() ret void } -; GCN-LABEL: 
{{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; MUBUF: buffer_store_dword -; FLATSCR: scratch_store_dword -; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4 -; GCN: v_writelane_b32 v40, s30, 0 -; GCN: v_writelane_b32 v40, s31, 1 -; GCN: v_writelane_b32 v40, s34, 2 -; GCN: v_writelane_b32 v40, s35, 3 - -; GCN: s_swappc_b64 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_swappc_b64 -; GCN: v_readlane_b32 s35, v40, 3 -; GCN: v_readlane_b32 s34, v40, 2 -; MUBUF-DAG: v_readlane_b32 s31, v40, 1 -; MUBUF-DAG: v_readlane_b32 s30, v40, 0 -; FLATSCR-DAG: v_readlane_b32 s31, v40, 1 -; FLATSCR-DAG: v_readlane_b32 s30, v40, 0 - -; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4 -; MUBUF: buffer_load_dword -; FLATSCR: scratch_load_dword -; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; GCN: s_setpc_b64 s[30:31] define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { +; MUBUF-LABEL: test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v40, s4, 4 +; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_writelane_b32 v40, s34, 2 +; MUBUF-NEXT: v_writelane_b32 v40, s35, 3 +; MUBUF-NEXT: s_getpc_b64 s[34:35] +; MUBUF-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35] +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35] +; MUBUF-NEXT: v_readlane_b32 s35, v40, 3 +; MUBUF-NEXT: v_readlane_b32 s34, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: v_readlane_b32 s4, v40, 4 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v40, s0, 4 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2 +; FLATSCR-NEXT: v_writelane_b32 v40, s35, 3 +; FLATSCR-NEXT: s_getpc_b64 s[34:35] +; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] +; FLATSCR-NEXT: 
v_readlane_b32 s35, v40, 3 +; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: v_readlane_b32 s0, v40, 4 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void @external_void_func_void() call void asm sideeffect "", ""() #0 call void @external_void_func_void() ret void } -; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN: s_mov_b32 s33, s32 -; MUBUF: buffer_store_dword v40 -; FLATSCR: scratch_store_dword off, v40 -; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4 -; MUBUF: s_addk_i32 s32, 0x400 -; FLATSCR: s_add_i32 s32, s32, 16 - -; GCN: s_swappc_b64 -; GCN-NEXT: s_swappc_b64 - -; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4 -; MUBUF: buffer_load_dword v40 -; FLATSCR: scratch_load_dword v40 -; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] define void @test_func_call_external_void_funcx2() #0 { +; MUBUF-LABEL: test_func_call_external_void_funcx2: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v40, s4, 4 +; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_writelane_b32 v40, s34, 2 +; MUBUF-NEXT: v_writelane_b32 v40, s35, 3 +; MUBUF-NEXT: s_getpc_b64 s[34:35] +; MUBUF-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35] +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35] +; MUBUF-NEXT: v_readlane_b32 s35, v40, 3 +; MUBUF-NEXT: v_readlane_b32 s34, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: v_readlane_b32 s4, v40, 4 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: test_func_call_external_void_funcx2: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v40, s0, 4 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2 +; FLATSCR-NEXT: v_writelane_b32 v40, s35, 3 +; FLATSCR-NEXT: s_getpc_b64 s[34:35] +; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35] +; FLATSCR-NEXT: 
v_readlane_b32 s35, v40, 3 +; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: v_readlane_b32 s0, v40, 4 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void @external_void_func_void() call void @external_void_func_void() ret void } -; GCN-LABEL: {{^}}void_func_void_clobber_s30_s31: -; GCN: s_waitcnt -; GCN: v_writelane_b32 v0, s30, 0 -; GCN: v_writelane_b32 v0, s31, 1 -; GCN-NEXT: #ASMSTART -; GCN: ; clobber -; GCN-NEXT: #ASMEND -; GCN: v_readlane_b32 s31, v0, 1 -; GCN: v_readlane_b32 s30, v0, 0 -; GCN: s_setpc_b64 s[30:31] define void @void_func_void_clobber_s30_s31() #2 { +; MUBUF-LABEL: void_func_void_clobber_s30_s31: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: v_writelane_b32 v0, s30, 0 +; MUBUF-NEXT: v_writelane_b32 v0, s31, 1 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s31, v0, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v0, 0 +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: void_func_void_clobber_s30_s31: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: v_writelane_b32 v0, s30, 0 +; FLATSCR-NEXT: v_writelane_b32 v0, s31, 1 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s31, v0, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v0, 0 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{s[30:31]}"() #0 ret void } -; GCN-LABEL: {{^}}void_func_void_clobber_vcc: -; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_setpc_b64 s[30:31] define hidden void @void_func_void_clobber_vcc() #2 { +; GCN-LABEL: void_func_void_clobber_vcc: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "", "~{vcc}"() #0 ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_clobber_vcc: -; GCN: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN: s_mov_b64 s[34:35], vcc -; GCN-NEXT: s_swappc_b64 -; GCN: s_mov_b64 vcc, s[34:35] define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(ptr addrspace(1) %out) #0 { +; FLATSCR-LABEL: test_call_void_func_void_clobber_vcc: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_add_u32 s8, s4, 8 +; 
FLATSCR-NEXT: s_addc_u32 s9, s5, 0 +; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; FLATSCR-NEXT: s_mov_b32 s14, s12 +; FLATSCR-NEXT: s_mov_b32 s13, s11 +; FLATSCR-NEXT: s_mov_b32 s12, s10 +; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7] +; FLATSCR-NEXT: s_getpc_b64 s[16:17] +; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_vcc@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_vcc@rel32@hi+12 +; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2 +; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1] +; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3] +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def vcc +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_mov_b64 s[34:35], vcc +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17] +; FLATSCR-NEXT: global_load_dword v0, v[0:1], off glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_mov_b64 vcc, s[34:35] +; FLATSCR-NEXT: global_load_dword v0, v[0:1], off glc +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: ; kill: killed $vgpr0_vgpr1 +; FLATSCR-NEXT: ; kill: killed $vgpr0_vgpr1 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use vcc +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %vcc = call i64 asm sideeffect "; def $0", "={vcc}"() call void @void_func_void_clobber_vcc() %val0 = load volatile i32, ptr addrspace(1) undef @@ -117,22 +266,50 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(ptr addrspace(1) ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_s31: -; GCN: s_mov_b32 s33, s31 -; GCN: s_swappc_b64 -; GCN-NEXT: s_mov_b32 s31, s33 define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) %out) #0 { +; FLATSCR-LABEL: test_call_void_func_void_mayclobber_s31: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s31 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_mov_b32 s33, s31 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: s_mov_b32 s31, s33 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s31 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %s31 = call i32 asm sideeffect "; def $0", "={s31}"() call void @external_void_func_void() call void asm sideeffect "; use $0", "{s31}"(i32 %s31) ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31: -; GCN: v_mov_b32_e32 v40, v31 -; GCN: s_swappc_b64 -; GCN-NEXT: v_mov_b32_e32 v31, v40 define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) %out) #0 { +; FLATSCR-LABEL: test_call_void_func_void_mayclobber_v31: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def v31 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_mov_b32_e32 v40, v31 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: v_mov_b32_e32 v31, v40 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use v31 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %v31 = call i32 
asm sideeffect "; def $0", "={v31}"() call void @external_void_func_void() call void asm sideeffect "; use $0", "{v31}"(i32 %v31) @@ -140,175 +317,294 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(ptr addrspace } ; FIXME: What is the expected behavior for reserved registers here? - -; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33: -; FLATSCR: s_getpc_b64 s[0:1] -; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; MUBUF: s_getpc_b64 s[4:5] -; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 - -; GCN: #ASMSTART -; GCN-NEXT: ; def s33 -; GCN-NEXT: #ASMEND - -; GCN-NOT: s33 - -; FLATSCR: s_swappc_b64 s[30:31], s[0:1] -; MUBUF: s_swappc_b64 s[30:31], s[4:5] - -; GCN-NOT: s33 - -; GCN: ;;#ASMSTART -; GCN-NEXT: ; use s33 -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_preserves_s33(ptr addrspace(1) %out) #0 { +; FLATSCR-LABEL: test_call_void_func_void_preserves_s33: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s33 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s33 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %s33 = call i32 asm sideeffect "; def $0", "={s33}"() call void @external_void_func_void() call void asm sideeffect "; use $0", "{s33}"(i32 %s33) ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}} -; GCN-NOT: s34 - -; FLATSCR: s_getpc_b64 s[0:1] -; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; MUBUF: s_getpc_b64 s[4:5] -; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; GCN: s_mov_b32 s32, 0 - -; GCN: ;;#ASMSTART -; GCN-NEXT: ; def s34 -; GCN-NEXT: ;;#ASMEND - -; GCN-NOT: s34 - -; MUBUF: s_swappc_b64 s[30:31], s[4:5] -; FLATSCR: s_swappc_b64 s[30:31], s[0:1] - -; GCN-NOT: s34 - -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; use s34 -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_preserves_s34(ptr addrspace(1) %out) #0 { +; FLATSCR-LABEL: test_call_void_func_void_preserves_s34: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s34 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s34 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %s34 = call i32 asm sideeffect "; def $0", "={s34}"() call void @external_void_func_void() call void asm sideeffect "; use $0", "{s34}"(i32 %s34) ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}} - -; 
MUBUF: s_getpc_b64 s[4:5] -; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 -; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; FLATSCR: s_getpc_b64 s[0:1] -; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 -; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GCN: s_mov_b32 s32, 0 - -; GCN: ;;#ASMSTART -; GCN-NEXT: ; def v40 -; GCN-NEXT: ;;#ASMEND - -; GCN-NOT: v40 - -; MUBUF: s_swappc_b64 s[30:31], s[4:5] -; FLATSCR: s_swappc_b64 s[30:31], s[0:1] - -; GCN-NOT: v40 - -; GCN: ;;#ASMSTART -; GCN-NEXT: ; use v40 -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_preserves_v40(ptr addrspace(1) %out) #0 { +; FLATSCR-LABEL: test_call_void_func_void_preserves_v40: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %v40 = call i32 asm sideeffect "; def $0", "={v40}"() call void @external_void_func_void() call void asm sideeffect "; use $0", "{v40}"(i32 %v40) ret void } -; GCN-LABEL: {{^}}void_func_void_clobber_s33: -; GCN: v_writelane_b32 v0, s33, 0 -; GCN-NEXT: #ASMSTART -; GCN-NEXT: ; clobber -; GCN-NEXT: #ASMEND -; GCN-NEXT: v_readlane_b32 s33, v0, 0 -; GCN: s_setpc_b64 define hidden void @void_func_void_clobber_s33() #2 { +; MUBUF-LABEL: void_func_void_clobber_s33: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: v_writelane_b32 v0, s33, 0 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s33, v0, 0 +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: void_func_void_clobber_s33: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: v_writelane_b32 v0, s33, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s33, v0, 0 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{s33}"() #0 ret void } -; GCN-LABEL: {{^}}void_func_void_clobber_s34: -; GCN: v_writelane_b32 v0, s34, 0 -; GCN-NEXT: #ASMSTART -; GCN-NEXT: ; clobber -; GCN-NEXT: #ASMEND -; GCN-NEXT: v_readlane_b32 s34, v0, 0 -; GCN: s_setpc_b64 define hidden void @void_func_void_clobber_s34() #2 { +; MUBUF-LABEL: void_func_void_clobber_s34: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: v_writelane_b32 v0, s34, 0 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; clobber +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s34, v0, 0 +; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[4:5] +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: void_func_void_clobber_s34: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: v_writelane_b32 v0, s34, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; clobber +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s34, v0, 0 +; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[0:1] +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{s34}"() #0 ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33: -; GCN: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN: s_mov_b32 s32, 0 -; GCN: s_swappc_b64 -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 { +; FLATSCR-LABEL: test_call_void_func_void_clobber_s33: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; FLATSCR-NEXT: s_mov_b32 s14, s12 +; FLATSCR-NEXT: s_mov_b32 s13, s11 +; FLATSCR-NEXT: s_mov_b32 s12, s10 +; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7] +; FLATSCR-NEXT: s_mov_b64 s[8:9], s[4:5] +; FLATSCR-NEXT: s_getpc_b64 s[16:17] +; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_s33@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_s33@rel32@hi+12 +; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2 +; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1] +; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3] +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17] +; FLATSCR-NEXT: s_endpgm call void @void_func_void_clobber_s33() ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s34: -; GCN: s_getpc_b64 -; GCN-NEXT: s_add_u32 -; GCN-NEXT: s_addc_u32 -; GCN: s_mov_b32 s32, 0 -; GCN: s_swappc_b64 -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 { +; FLATSCR-LABEL: test_call_void_func_void_clobber_s34: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; FLATSCR-NEXT: s_mov_b32 s14, s12 +; FLATSCR-NEXT: s_mov_b32 s13, s11 +; FLATSCR-NEXT: s_mov_b32 s12, s10 +; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7] +; FLATSCR-NEXT: s_mov_b64 s[8:9], s[4:5] +; FLATSCR-NEXT: s_getpc_b64 s[16:17] +; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_s34@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_s34@rel32@hi+12 +; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2 +; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1] +; 
FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3] +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17] +; FLATSCR-NEXT: s_endpgm call void @void_func_void_clobber_s34() ret void } -; GCN-LABEL: {{^}}callee_saved_sgpr_func: -; GCN-NOT: s40 -; GCN: v_writelane_b32 v40, s40 -; GCN: s_swappc_b64 -; GCN-NOT: s40 -; GCN: ; use s40 -; GCN-NOT: s40 -; GCN: v_readlane_b32 s40, v40 -; GCN-NOT: s40 define void @callee_saved_sgpr_func() #2 { +; MUBUF-LABEL: callee_saved_sgpr_func: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v40, s4, 3 +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: s_getpc_b64 s[4:5] +; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: v_writelane_b32 v40, s40, 2 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def s40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use s40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: v_readlane_b32 s40, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: v_readlane_b32 s4, v40, 3 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_saved_sgpr_func: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v40, s0, 3 +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: v_writelane_b32 v40, s40, 2 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_readlane_b32 s40, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: v_readlane_b32 s0, v40, 3 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 call void @external_void_func_void() call void asm sideeffect "; use $0", "s"(i32 %s40) #0 ret void } -; GCN-LABEL: {{^}}callee_saved_sgpr_kernel: -; GCN-NOT: s40 -; GCN: ; def s40 -; GCN-NOT: s40 -; GCN: s_swappc_b64 
-; GCN-NOT: s40 -; GCN: ; use s40 -; GCN-NOT: s40 define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 { +; FLATSCR-LABEL: callee_saved_sgpr_kernel: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 call void @external_void_func_void() call void asm sideeffect "; use $0", "s"(i32 %s40) #0 @@ -316,16 +612,92 @@ define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 { } ; First call preserved VGPR is used so it can't be used for SGPR spills. -; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_func: -; GCN-NOT: s40 -; GCN: v_writelane_b32 v41, s40 -; GCN: s_swappc_b64 -; GCN-NOT: s40 -; GCN: ; use s40 -; GCN-NOT: s40 -; GCN: v_readlane_b32 s40, v41 -; GCN-NOT: s40 define void @callee_saved_sgpr_vgpr_func() #2 { +; MUBUF-LABEL: callee_saved_sgpr_vgpr_func: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: v_writelane_b32 v41, s4, 3 +; MUBUF-NEXT: s_addk_i32 s32, 0x400 +; MUBUF-NEXT: v_writelane_b32 v41, s30, 0 +; MUBUF-NEXT: v_writelane_b32 v41, s31, 1 +; MUBUF-NEXT: s_getpc_b64 s[4:5] +; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: v_writelane_b32 v41, s40, 2 +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def s40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; def v40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use s40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: ;;#ASMSTART +; MUBUF-NEXT: ; use v40 +; MUBUF-NEXT: ;;#ASMEND +; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: v_readlane_b32 s40, v41, 2 +; MUBUF-NEXT: v_readlane_b32 s31, v41, 1 +; MUBUF-NEXT: v_readlane_b32 s30, v41, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 +; MUBUF-NEXT: v_readlane_b32 s4, v41, 3 +; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: s_mov_b64 exec, s[6:7] +; MUBUF-NEXT: s_mov_b32 s33, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: callee_saved_sgpr_vgpr_func: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, s33 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: v_writelane_b32 v41, s0, 3 +; FLATSCR-NEXT: s_add_i32 s32, s32, 16 +; FLATSCR-NEXT: v_writelane_b32 v41, s30, 0 +; FLATSCR-NEXT: v_writelane_b32 v41, s31, 1 
+; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: v_writelane_b32 v41, s40, 2 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: v_readlane_b32 s40, v41, 2 +; FLATSCR-NEXT: v_readlane_b32 s31, v41, 1 +; FLATSCR-NEXT: v_readlane_b32 s30, v41, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 +; FLATSCR-NEXT: v_readlane_b32 s0, v41, 3 +; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 +; FLATSCR-NEXT: scratch_load_dword v41, off, s33 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] +; FLATSCR-NEXT: s_mov_b32 s33, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 %v40 = call i32 asm sideeffect "; def v40", "={v40}"() #0 call void @external_void_func_void() @@ -334,15 +706,30 @@ define void @callee_saved_sgpr_vgpr_func() #2 { ret void } -; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_kernel: -; GCN-NOT: s40 -; GCN: ; def s40 -; GCN-NOT: s40 -; GCN: s_swappc_b64 -; GCN-NOT: s40 -; GCN: ; use s40 -; GCN-NOT: s40 define amdgpu_kernel void @callee_saved_sgpr_vgpr_kernel() #2 { +; FLATSCR-LABEL: callee_saved_sgpr_vgpr_kernel: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; FLATSCR-NEXT: s_getpc_b64 s[0:1] +; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: s_mov_b32 s32, 0 +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; def v32 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: v_mov_b32_e32 v40, v32 +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use s40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: ;;#ASMSTART +; FLATSCR-NEXT: ; use v40 +; FLATSCR-NEXT: ;;#ASMEND +; FLATSCR-NEXT: s_endpgm %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 %v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0 call void @external_void_func_void() diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir b/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir index 9e46c58b6b5a9..76eaf350301e4 100644 --- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir +++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.mir @@ -18,3 +18,16 @@ body: | $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 V_MOV_B32_indirect_write undef $vgpr0, undef $vgpr3, implicit $exec, implicit $m0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3(tied-def 4) ... 
+ +# CHECK: meta: ; @meta +# CHECK: ; wave barrier +# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] +# CHECK: ; codeLenInByte = 4 +--- +name: meta +tracksRegLiveness: true +body: | + bb.0: + + WAVE_BARRIER +... diff --git a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir index 3db2b6ed9ab4b..807eaf2160b3c 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -96,8 +96,8 @@ body: | %12:vgpr_32 = V_AND_B32_e64 %8, %8, implicit $exec FLAT_STORE_DWORD %19, %12, 0, 0, implicit $exec, implicit $flat_scr - %13:vgpr_32 = V_AND_B32_e64 %16, %16, implicit $exec - FLAT_STORE_DWORD %19, %13, 0, 0, implicit $exec, implicit $flat_scr + %21:vgpr_32 = V_AND_B32_e64 %16, %16, implicit $exec + FLAT_STORE_DWORD %19, %21, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 @@ -191,6 +191,7 @@ body: | name: v_fold_ashr_imm_regimm_32 tracksRegLiveness: true +isSSA: true liveins: - { reg: '$sgpr0_sgpr1', virtual-reg: '%0' } - { reg: '$vgpr0', virtual-reg: '%2' } @@ -232,8 +233,8 @@ body: | %14:vgpr_32 = V_ASHR_I32_e64 7, %29, implicit $exec FLAT_STORE_DWORD %20, %14, 0, 0, implicit $exec, implicit $flat_scr - %15:vgpr_32 = V_ASHR_I32_e64 %27, %24, implicit $exec - FLAT_STORE_DWORD %20, %15, 0, 0, implicit $exec, implicit $flat_scr + %33:vgpr_32 = V_ASHR_I32_e64 %27, %24, implicit $exec + FLAT_STORE_DWORD %20, %33, 0, 0, implicit $exec, implicit $flat_scr %22:vgpr_32 = V_ASHR_I32_e64 %6, 4, implicit $exec FLAT_STORE_DWORD %20, %22, 0, 0, implicit $exec, implicit $flat_scr @@ -356,8 +357,8 @@ body: | %14:vgpr_32 = V_LSHR_B32_e64 7, %29, implicit $exec FLAT_STORE_DWORD %20, %14, 0, 0, implicit $exec, implicit $flat_scr - %15:vgpr_32 = V_LSHR_B32_e64 %27, %24, implicit $exec - FLAT_STORE_DWORD %20, %15, 0, 0, implicit $exec, implicit $flat_scr + %33:vgpr_32 = V_LSHR_B32_e64 %27, %24, implicit $exec + FLAT_STORE_DWORD %20, %33, 0, 0, implicit $exec, implicit $flat_scr %22:vgpr_32 = V_LSHR_B32_e64 %6, 4, implicit $exec FLAT_STORE_DWORD %20, %22, 0, 0, implicit $exec, implicit $flat_scr @@ -497,8 +498,8 @@ body: | # GCN: %17:vgpr_32 = V_MOV_B32_e32 1234567, implicit $exec # GCN: FLAT_STORE_DWORD %10, %17, -# GCN: %3:vgpr_32 = V_MOV_B32_e32 63, implicit $exec -# GCN: FLAT_STORE_DWORD %10, %3, +# GCN: %18:vgpr_32 = V_MOV_B32_e32 63, implicit $exec +# GCN: FLAT_STORE_DWORD %10, %18, name: v_fold_or_imm_regimm_32 alignment: 0 @@ -536,8 +537,8 @@ body: | FLAT_STORE_DWORD %19, %11, 0, 0, implicit $exec, implicit $flat_scr %12:vgpr_32 = V_OR_B32_e64 %8, %8, implicit $exec FLAT_STORE_DWORD %19, %12, 0, 0, implicit $exec, implicit $flat_scr - %13:vgpr_32 = V_OR_B32_e64 %16, %16, implicit $exec - FLAT_STORE_DWORD %19, %13, 0, 0, implicit $exec, implicit $flat_scr + %21:vgpr_32 = V_OR_B32_e64 %16, %16, implicit $exec + FLAT_STORE_DWORD %19, %21, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... 
@@ -689,24 +690,24 @@ body: | # GCN: %19:vgpr_32 = V_MOV_B32_e32 24, implicit $exec # GCN: FLAT_STORE_DWORD %10, %19, -# GCN: %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec -# GCN: FLAT_STORE_DWORD %10, %3, - -# GCN: %20:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec +# GCN: %20:vgpr_32 = V_MOV_B32_e32 0, implicit $exec # GCN: FLAT_STORE_DWORD %10, %20, -# GCN: %21:vgpr_32 = V_MOV_B32_e32 1, implicit $exec +# GCN: %21:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec # GCN: FLAT_STORE_DWORD %10, %21, -# GCN: %22:vgpr_32 = V_MOV_B32_e32 2, implicit $exec +# GCN: %22:vgpr_32 = V_MOV_B32_e32 1, implicit $exec # GCN: FLAT_STORE_DWORD %10, %22, -# GCN: %23:vgpr_32 = V_MOV_B32_e32 7927808, implicit $exec +# GCN: %23:vgpr_32 = V_MOV_B32_e32 2, implicit $exec # GCN: FLAT_STORE_DWORD %10, %23, -# GCN: %24:vgpr_32 = V_MOV_B32_e32 -8, implicit $exec +# GCN: %24:vgpr_32 = V_MOV_B32_e32 7927808, implicit $exec # GCN: FLAT_STORE_DWORD %10, %24, +# GCN: %25:vgpr_32 = V_MOV_B32_e32 -8, implicit $exec +# GCN: FLAT_STORE_DWORD %10, %25, + name: v_fold_shl_imm_regimm_32 alignment: 0 exposesReturnsTwice: false @@ -745,8 +746,8 @@ body: | FLAT_STORE_DWORD %20, %13, 0, 0, implicit $exec, implicit $flat_scr %14:vgpr_32 = V_LSHL_B32_e64 12, %7, implicit $exec FLAT_STORE_DWORD %20, %14, 0, 0, implicit $exec, implicit $flat_scr - %15:vgpr_32 = V_LSHL_B32_e64 12, %24, implicit $exec - FLAT_STORE_DWORD %20, %15, 0, 0, implicit $exec, implicit $flat_scr + %30:vgpr_32 = V_LSHL_B32_e64 12, %24, implicit $exec + FLAT_STORE_DWORD %20, %30, 0, 0, implicit $exec, implicit $flat_scr %22:vgpr_32 = V_LSHL_B32_e64 %6, 12, implicit $exec FLAT_STORE_DWORD %20, %22, 0, 0, implicit $exec, implicit $flat_scr %23:vgpr_32 = V_LSHL_B32_e64 %6, 32, implicit $exec @@ -926,3 +927,37 @@ body: | S_ENDPGM 0, implicit %3 ... + +--- +name: constant_s_xor_b32_uses_subreg +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: constant_s_xor_b32_uses_subreg + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 47 + ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; GCN-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]] + %0:sreg_64 = S_MOV_B64 32 + %1:sreg_64 = S_MOV_B64 15 + %2:sgpr_32 = S_XOR_B32 %0.sub0, %1.sub0, implicit-def dead $scc + %3:sgpr_32 = S_XOR_B32 %0.sub1, %1.sub1, implicit-def dead $scc + S_ENDPGM 0, implicit %2, implicit %3 + +... + +--- +name: constant_v_or_b32_uses_subreg +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: constant_v_or_b32_uses_subreg + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 268435455, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; GCN-NEXT: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]] + %0:vreg_64 = V_MOV_B64_PSEUDO 18446744069683019775, implicit $exec + %1:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + %2:vgpr_32 = V_OR_B32_e32 %0.sub0, %1.sub0, implicit $exec + %3:vgpr_32 = V_OR_B32_e32 %0.sub1, %1.sub1, implicit $exec + S_ENDPGM 0, implicit %2, implicit %3 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir index 12e8d24cb3675..ade7b4266e9e6 100644 --- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir +++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir @@ -2162,8 +2162,7 @@ body: | ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX11-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec + ; GFX11-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 $sgpr33, 12352, 0, implicit $exec ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX11-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 @@ -2178,8 +2177,7 @@ body: | ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc - ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX12-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec + ; GFX12-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 $sgpr33, 4160, 0, implicit $exec ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX12-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 @@ -2315,8 +2313,7 @@ body: | ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX11-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec + ; GFX11-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 $sgpr33, 12352, 0, implicit $exec ; GFX11-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 @@ -2332,8 +2329,7 @@ body: | ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc - ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX12-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec + ; GFX12-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 $sgpr33, 4160, 0, implicit $exec ; GFX12-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 @@ -2469,8 +2465,7 @@ body: | ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX11-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec + ; GFX11-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 $sgpr33, 12352, 0, 
implicit $exec ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX11-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 @@ -2485,8 +2480,7 @@ body: | ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc - ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX12-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec + ; GFX12-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 $sgpr33, 4160, 0, implicit $exec ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX12-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 @@ -2622,8 +2616,7 @@ body: | ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX11-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec + ; GFX11-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 $sgpr33, 12352, 0, implicit $exec ; GFX11-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 @@ -2639,8 +2632,7 @@ body: | ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc - ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec - ; GFX12-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec + ; GFX12-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 $sgpr33, 4160, 0, implicit $exec ; GFX12-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/fold-literal-multiple-gfx10.mir b/llvm/test/CodeGen/AMDGPU/fold-literal-multiple-gfx10.mir new file mode 100644 index 0000000000000..e71516e74f17e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fold-literal-multiple-gfx10.mir @@ -0,0 +1,66 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass=si-fold-operands -o - %s | FileCheck %s + +# The same literal may be used multiple times in different operands, +# as long as it is the same value. + +--- +name: fold_multiple_same_literal_use_0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: fold_multiple_same_literal_use_0 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_DIV_SCALE_F32_e64_:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, 1178657792, 0, 1178657792, 0, 1178657792, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_DIV_SCALE_F32_e64_]] + %0:vgpr_32 = COPY $vgpr0 + %1:sreg_32 = S_MOV_B32 1178657792 + %2:vgpr_32 = COPY %1 + %3:vgpr_32, %4:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, %2, 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %3 +... 
+ +--- +name: fold_multiple_same_literal_use_1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: fold_multiple_same_literal_use_1 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_DIV_SCALE_F32_e64_:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, 1178657792, 0, 1178657792, 0, 1178657792, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_DIV_SCALE_F32_e64_]] + %0:vgpr_32 = COPY $vgpr0 + %1:sreg_32 = S_MOV_B32 1178657792 + %2:vgpr_32 = COPY %1 + %3:vgpr_32, %4:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, 1178657792, 0, 1178657792, 0, %2, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %3 +... + +--- +name: no_fold_multiple_same_literal_different_value +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: no_fold_multiple_same_literal_different_value + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1178657793, implicit $exec + ; CHECK-NEXT: [[V_DIV_SCALE_F32_e64_:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, 1178657792, 0, 1178657792, 0, [[V_MOV_B32_e32_]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_DIV_SCALE_F32_e64_]] + %0:vgpr_32 = COPY $vgpr0 + %1:sreg_32 = S_MOV_B32 1178657793 + %2:vgpr_32 = COPY %1 + %3:vgpr_32, %4:sreg_32_xm0_xexec = V_DIV_SCALE_F32_e64 0, 1178657792, 0, 1178657792, 0, %2, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %3 +... diff --git a/llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir index 268a8a4783d24..edd5d0a119e5f 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-vgpr-copy.mir @@ -55,8 +55,7 @@ body: | # GCN-LABEL: name: fma_sgpr_sgpr_use # GCN: %0:sgpr_32 = IMPLICIT_DEF -# GCN-NEXT: %2:vgpr_32 = V_MOV_B32_e32 1234567, implicit $exec -# GCN-NEXT: %3:vgpr_32 = V_FMAC_F32_e64 0, %0, 0, 1234567, 0, %2, 0, 0, implicit $mode, implicit $exec +# GCN: %3:vgpr_32 = V_FMA_F32_e64 0, %0, 0, 1234567, 0, 1234567, 0, 0, implicit $mode, implicit $exec --- name: fma_sgpr_sgpr_use body: | diff --git a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll index e1ba6489a5317..f0609f62a9024 100644 --- a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll @@ -1,36 +1,66 @@ -; RUN: not llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s -; RUN: not llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s + +; Check illegal casts are codegened as poison, and not an error.
-; ERROR: error: :0:0: in function use_group_to_global_addrspacecast void (ptr addrspace(3)): invalid addrspacecast define amdgpu_kernel void @use_group_to_global_addrspacecast(ptr addrspace(3) %ptr) { +; CHECK-LABEL: use_group_to_global_addrspacecast: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: flat_store_dword v[0:1], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_endpgm %stof = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(1) store volatile i32 0, ptr addrspace(1) %stof ret void } -; ERROR: error: :0:0: in function use_local_to_constant32bit_addrspacecast void (ptr addrspace(3)): invalid addrspacecast define amdgpu_kernel void @use_local_to_constant32bit_addrspacecast(ptr addrspace(3) %ptr) { +; CHECK-LABEL: use_local_to_constant32bit_addrspacecast: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_load_dword s0, s[0:1], 0x0 +; CHECK-NEXT: s_endpgm %stof = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(6) %load = load volatile i32, ptr addrspace(6) %stof ret void } -; ERROR: error: :0:0: in function use_constant32bit_to_local_addrspacecast void (ptr addrspace(6)): invalid addrspacecast define amdgpu_kernel void @use_constant32bit_to_local_addrspacecast(ptr addrspace(6) %ptr) { +; CHECK-LABEL: use_constant32bit_to_local_addrspacecast: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: ds_read_b32 v0, v0 +; CHECK-NEXT: s_endpgm %cast = addrspacecast ptr addrspace(6) %ptr to ptr addrspace(3) %load = load volatile i32, ptr addrspace(3) %cast ret void } -; ERROR: error: :0:0: in function use_local_to_42_addrspacecast void (ptr addrspace(3)): invalid addrspacecast define amdgpu_kernel void @use_local_to_42_addrspacecast(ptr addrspace(3) %ptr) { +; SDAG-LABEL: use_local_to_42_addrspacecast: +; SDAG: ; %bb.0: +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: use_local_to_42_addrspacecast: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_endpgm %cast = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(42) store volatile ptr addrspace(42) %cast, ptr addrspace(1) null ret void } -; ERROR: error: :0:0: in function use_42_to_local_addrspacecast void (ptr addrspace(42)): invalid addrspacecast define amdgpu_kernel void @use_42_to_local_addrspacecast(ptr addrspace(42) %ptr) { +; CHECK-LABEL: use_42_to_local_addrspacecast: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: ds_read_b32 v0, v0 +; CHECK-NEXT: s_endpgm %cast = addrspacecast ptr addrspace(42) %ptr to ptr addrspace(3) %load = load volatile i32, ptr addrspace(3) %cast ret void diff --git a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll index 3eb9d474ec030..f961e857f39e5 100644 --- a/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll +++ b/llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll @@ -79,9 +79,9 @@ define void @issue92561(ptr addrspace(1) %arg) { ; GISEL: ; %bb.0: ; %bb ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_clause 0x1 -; GISEL-NEXT: global_load_b128 v[2:5], v[0:1], off -; GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: global_load_b128 v[4:7], v[0:1], off +; GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16 +; GISEL-NEXT: v_mov_b32_e32 v8, 0 ; GISEL-NEXT: s_mov_b32 s20, 0 ; 
GISEL-NEXT: s_mov_b32 s3, exec_lo ; GISEL-NEXT: s_mov_b32 s21, s20 @@ -97,19 +97,19 @@ define void @issue92561(ptr addrspace(1) %arg) { ; GISEL-NEXT: s_mov_b32 s11, s20 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_readfirstlane_b32 s12, v2 -; GISEL-NEXT: v_readfirstlane_b32 s13, v3 -; GISEL-NEXT: v_readfirstlane_b32 s14, v4 -; GISEL-NEXT: v_readfirstlane_b32 s15, v5 -; GISEL-NEXT: v_readfirstlane_b32 s16, v6 -; GISEL-NEXT: v_readfirstlane_b32 s17, v7 -; GISEL-NEXT: v_readfirstlane_b32 s18, v8 -; GISEL-NEXT: v_readfirstlane_b32 s19, v9 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[12:13], v[2:3] -; GISEL-NEXT: v_cmp_eq_u64_e64 s0, s[14:15], v[4:5] -; GISEL-NEXT: v_cmp_eq_u64_e64 s1, s[16:17], v[6:7] +; GISEL-NEXT: v_readfirstlane_b32 s12, v4 +; GISEL-NEXT: v_readfirstlane_b32 s13, v5 +; GISEL-NEXT: v_readfirstlane_b32 s14, v6 +; GISEL-NEXT: v_readfirstlane_b32 s15, v7 +; GISEL-NEXT: v_readfirstlane_b32 s16, v0 +; GISEL-NEXT: v_readfirstlane_b32 s17, v1 +; GISEL-NEXT: v_readfirstlane_b32 s18, v2 +; GISEL-NEXT: v_readfirstlane_b32 s19, v3 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[12:13], v[4:5] +; GISEL-NEXT: v_cmp_eq_u64_e64 s0, s[14:15], v[6:7] +; GISEL-NEXT: v_cmp_eq_u64_e64 s1, s[16:17], v[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GISEL-NEXT: v_cmp_eq_u64_e64 s2, s[18:19], v[8:9] +; GISEL-NEXT: v_cmp_eq_u64_e64 s2, s[18:19], v[2:3] ; GISEL-NEXT: s_and_b32 s0, vcc_lo, s0 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GISEL-NEXT: s_and_b32 s0, s0, s1 @@ -117,29 +117,31 @@ define void @issue92561(ptr addrspace(1) %arg) { ; GISEL-NEXT: s_and_b32 s0, s0, s2 ; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_and_saveexec_b32 s0, s0 -; GISEL-NEXT: image_sample_c_lz v1, [v0, v0, v0, v0], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY -; GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 -; GISEL-NEXT: ; implicit-def: $vgpr0 +; GISEL-NEXT: image_sample_c_lz v9, [v8, v8, v8, v8], s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GISEL-NEXT: s_cbranch_execnz .LBB0_1 ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b32 exec_lo, s3 -; GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1.0 -; GISEL-NEXT: v_mov_b32_e32 v0, 0x7fc00000 +; GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0x7fc00000 +; GISEL-NEXT: v_mov_b32_e32 v2, 1.0 ; GISEL-NEXT: s_clause 0x2 -; GISEL-NEXT: image_sample_c_lz v0, [v2, v2, v0, v2], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY -; GISEL-NEXT: image_sample_c_lz v3, [v2, v3, v2, v2], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY -; GISEL-NEXT: image_sample_c_lz v4, [v2, v2, v2, v2], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GISEL-NEXT: image_sample_c_lz v0, [v1, v1, v0, v1], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GISEL-NEXT: image_sample_c_lz v2, [v1, v2, v1, v1], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GISEL-NEXT: image_sample_c_lz v3, [v1, v1, v1, v1], s[4:11], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GISEL-NEXT: s_waitcnt vmcnt(2) -; GISEL-NEXT: v_add_f32_e32 v0, v1, v0 +; GISEL-NEXT: v_add_f32_e32 v0, v9, v0 ; GISEL-NEXT: s_waitcnt vmcnt(1) -; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 
-; GISEL-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_mov_b32 v3, v2 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GISEL-NEXT: v_add_f32_e32 v0, v2, v0 +; GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_add_f32_e32 v0, v4, v0 +; GISEL-NEXT: v_add_f32_e32 v0, v3, v0 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-NEXT: v_mul_f32_e32 v1, 0x3e800000, v0 -; GISEL-NEXT: image_store v[1:3], [v2, v2], s[4:11] dim:SQ_RSRC_IMG_2D unorm +; GISEL-NEXT: v_mul_f32_e32 v0, 0x3e800000, v0 +; GISEL-NEXT: image_store v[0:2], [v1, v1], s[4:11] dim:SQ_RSRC_IMG_2D unorm ; GISEL-NEXT: s_setpc_b64 s[30:31] bb: %descriptor = load <8 x i32>, ptr addrspace(1) %arg, align 32 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 893b9fa6fb40d..d7f54f3b8e9e2 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -145,11 +145,11 @@ ; GCN-O0-NEXT: Post RA hazard recognizer ; GCN-O0-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O0-NEXT: Branch relaxation pass -; GCN-O0-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O0-NEXT: Register Usage Information Collector Pass ; GCN-O0-NEXT: Remove Loads Into Fake Uses ; GCN-O0-NEXT: Live DEBUG_VALUE analysis ; GCN-O0-NEXT: Machine Sanitizer Binary Metadata +; GCN-O0-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O0-NEXT: Machine Optimization Remark Emitter ; GCN-O0-NEXT: Stack Frame Layout Analysis @@ -430,11 +430,11 @@ ; GCN-O1-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O1-NEXT: AMDGPU Insert Delay ALU ; GCN-O1-NEXT: Branch relaxation pass -; GCN-O1-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O1-NEXT: Register Usage Information Collector Pass ; GCN-O1-NEXT: Remove Loads Into Fake Uses ; GCN-O1-NEXT: Live DEBUG_VALUE analysis ; GCN-O1-NEXT: Machine Sanitizer Binary Metadata +; GCN-O1-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-NEXT: Machine Optimization Remark Emitter ; GCN-O1-NEXT: Stack Frame Layout Analysis @@ -743,11 +743,11 @@ ; GCN-O1-OPTS-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O1-OPTS-NEXT: AMDGPU Insert Delay ALU ; GCN-O1-OPTS-NEXT: Branch relaxation pass -; GCN-O1-OPTS-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O1-OPTS-NEXT: Register Usage Information Collector Pass ; GCN-O1-OPTS-NEXT: Remove Loads Into Fake Uses ; GCN-O1-OPTS-NEXT: Live DEBUG_VALUE analysis ; GCN-O1-OPTS-NEXT: Machine Sanitizer Binary Metadata +; GCN-O1-OPTS-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter ; GCN-O1-OPTS-NEXT: Stack Frame Layout Analysis @@ -1062,11 +1062,11 @@ ; GCN-O2-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O2-NEXT: AMDGPU Insert Delay ALU ; GCN-O2-NEXT: Branch relaxation pass -; GCN-O2-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O2-NEXT: Register Usage Information Collector Pass ; GCN-O2-NEXT: Remove Loads Into Fake Uses ; GCN-O2-NEXT: Live DEBUG_VALUE analysis ; GCN-O2-NEXT: Machine Sanitizer Binary Metadata +; GCN-O2-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O2-NEXT: Machine Optimization Remark Emitter ; GCN-O2-NEXT: Stack Frame Layout Analysis @@ -1394,11 +1394,11 @@ ; GCN-O3-NEXT: AMDGPU Insert waits for SGPR read hazards ; GCN-O3-NEXT: AMDGPU 
Insert Delay ALU ; GCN-O3-NEXT: Branch relaxation pass -; GCN-O3-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O3-NEXT: Register Usage Information Collector Pass ; GCN-O3-NEXT: Remove Loads Into Fake Uses ; GCN-O3-NEXT: Live DEBUG_VALUE analysis ; GCN-O3-NEXT: Machine Sanitizer Binary Metadata +; GCN-O3-NEXT: AMDGPU Preload Kernel Arguments Prolog ; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis ; GCN-O3-NEXT: Machine Optimization Remark Emitter ; GCN-O3-NEXT: Stack Frame Layout Analysis diff --git a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir index d070a8ef5dd2d..ddeb45a48a6ee 100644 --- a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir +++ b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir @@ -344,3 +344,222 @@ body: | %3:vgpr_32 = V_FMA_F32_e64 0, %0, 0, %1, 0, %2.sub1, 0, 0, implicit $mode, implicit $exec SI_RETURN_TO_EPILOG %3 ... + +--- +name: fold_aimm_virtual +body: | + bb.0: + + ; GCN-LABEL: name: fold_aimm_virtual + ; GCN: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_ACCVGPR_WRITE_B32_e64_]] + %0:agpr_32 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec + %1:agpr_32 = COPY killed %0 + SI_RETURN_TO_EPILOG implicit %1 + +... + +--- +name: fold_aimm_virtual_copy_to_vgpr +body: | + bb.0: + + ; GCN-LABEL: name: fold_aimm_virtual_copy_to_vgpr + ; GCN: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 64, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B32_e32_]] + %0:agpr_32 = V_ACCVGPR_WRITE_B32_e64 64, implicit $exec + %1:vgpr_32 = COPY killed %0 + SI_RETURN_TO_EPILOG implicit %1 + +... + +--- +name: fold_v_mov_b64_64_sub0_to_vgpr_32 +body: | + bb.0: + + ; GCN-LABEL: name: fold_v_mov_b64_64_sub0_to_vgpr_32 + ; GCN: [[V_MOV_B64_e32_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1412567312, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]] + %0:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec + %1:vgpr_32 = COPY killed %0.sub0 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_v_mov_b64_64_sub1_to_vgpr_32 +body: | + bb.0: + + ; GCN-LABEL: name: fold_v_mov_b64_64_sub1_to_vgpr_32 + ; GCN: [[V_MOV_B64_e32_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 305419896, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]] + %0:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec + %1:vgpr_32 = COPY killed %0.sub1 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_v_mov_b64_64 +body: | + bb.0: + + ; GCN-LABEL: name: fold_v_mov_b64_64 + ; GCN: [[V_MOV_B64_e32_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec + ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]] + %0:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec + %1:vreg_64_align2 = COPY killed %0 + SI_RETURN_TO_EPILOG implicit %1 + +... 
+ +--- +name: fold_v_mov_b64_64_to_unaligned +body: | + bb.0: + ; GCN-LABEL: name: fold_v_mov_b64_64_to_unaligned + ; GCN: [[V_MOV_B64_e32_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec + ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]] + %0:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec + %1:vreg_64 = COPY killed %0 + SI_RETURN_TO_EPILOG implicit %1 +... + +--- +name: fold_v_mov_b64_pseudo_64_to_unaligned +body: | + bb.0: + ; GCN-LABEL: name: fold_v_mov_b64_pseudo_64_to_unaligned + ; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]] + %0:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec + %1:vreg_64 = COPY killed %0 + SI_RETURN_TO_EPILOG implicit %1 +... + +--- +name: fold_s_brev_b32_simm_virtual_0 +body: | + bb.0: + + ; GCN-LABEL: name: fold_s_brev_b32_simm_virtual_0 + ; GCN: [[S_BREV_B32_:%[0-9]+]]:sreg_32 = S_BREV_B32 1 + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648 + ; GCN-NEXT: SI_RETURN_TO_EPILOG + %0:sreg_32 = S_BREV_B32 1 + %1:sreg_32 = COPY killed %0 + SI_RETURN_TO_EPILOG + +... + +--- +name: fold_s_brev_b32_simm_virtual_1 +body: | + bb.0: + + ; GCN-LABEL: name: fold_s_brev_b32_simm_virtual_1 + ; GCN: [[S_BREV_B32_:%[0-9]+]]:sreg_32 = S_BREV_B32 -64 + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 67108863 + ; GCN-NEXT: SI_RETURN_TO_EPILOG + %0:sreg_32 = S_BREV_B32 -64 + %1:sreg_32 = COPY killed %0 + SI_RETURN_TO_EPILOG + +... + +--- +name: fold_v_bfrev_b32_e32_imm +body: | + bb.0: + + ; GCN-LABEL: name: fold_v_bfrev_b32_e32_imm + ; GCN: [[V_BFREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_BFREV_B32_e32 1, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]] + %0:vgpr_32 = V_BFREV_B32_e32 1, implicit $exec + %1:vgpr_32 = COPY killed %0 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_v_bfrev_b32_e64_imm +body: | + bb.0: + + ; GCN-LABEL: name: fold_v_bfrev_b32_e64_imm + ; GCN: [[V_BFREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFREV_B32_e64 1, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]] + %0:vgpr_32 = V_BFREV_B32_e64 1, implicit $exec + %1:vgpr_32 = COPY killed %0 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_s_not_b32_simm_virtual_0 +body: | + bb.0: + + ; GCN-LABEL: name: fold_s_not_b32_simm_virtual_0 + ; GCN: [[S_NOT_B32_:%[0-9]+]]:sreg_32 = S_NOT_B32 1, implicit-def $scc + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -2 + ; GCN-NEXT: SI_RETURN_TO_EPILOG + %0:sreg_32 = S_NOT_B32 1, implicit-def $scc + %1:sreg_32 = COPY killed %0 + SI_RETURN_TO_EPILOG + +... + +--- +name: fold_s_not_b32_simm_virtual_1 +body: | + bb.0: + + ; GCN-LABEL: name: fold_s_not_b32_simm_virtual_1 + ; GCN: [[S_NOT_B32_:%[0-9]+]]:sreg_32 = S_NOT_B32 -64, implicit-def $scc + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: SI_RETURN_TO_EPILOG + %0:sreg_32 = S_NOT_B32 -64, implicit-def $scc + %1:sreg_32 = COPY killed %0 + SI_RETURN_TO_EPILOG + +... 
+ +--- +name: fold_v_not_b32_e32_imm +body: | + bb.0: + + ; GCN-LABEL: name: fold_v_not_b32_e32_imm + ; GCN: [[V_NOT_B32_e32_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e32 1, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]] + %0:vgpr_32 = V_NOT_B32_e32 1, implicit $exec + %1:vgpr_32 = COPY killed %0 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_v_not_b32_e64_imm +body: | + bb.0: + + ; GCN-LABEL: name: fold_v_not_b32_e64_imm + ; GCN: [[V_NOT_B32_e64_:%[0-9]+]]:vgpr_32 = V_NOT_B32_e64 1, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2, implicit $exec + ; GCN-NEXT: SI_RETURN_TO_EPILOG [[V_MOV_B32_e32_]] + %0:vgpr_32 = V_NOT_B32_e64 1, implicit $exec + %1:vgpr_32 = COPY killed %0 + SI_RETURN_TO_EPILOG %1 + +... diff --git a/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir b/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir index ea8e2edb80c7e..6d2f4e76840ae 100644 --- a/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/peephole-opt-fold-reg-sequence-subreg.mir @@ -162,7 +162,7 @@ body: | ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub2 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY4]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -189,7 +189,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub0_sub1, %subreg.sub0, [[COPY1]], %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub0 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:vreg_128 = COPY $vgpr4_vgpr5_vgpr6_vgpr7 @@ -212,7 +212,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]].sub0_sub1, %subreg.sub0, [[COPY1]], %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub0 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:vreg_128 = COPY $vgpr4_vgpr5_vgpr6_vgpr7 @@ -285,7 +285,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1, [[V_MOV_B32_e32_]], %subreg.sub2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]] %0:vreg_64 = COPY $vgpr1_vgpr2 %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -311,8 +311,8 @@ body: | ; CHECK-NEXT: 
[[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY]].sub0, %subreg.sub2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub3 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]], implicit [[COPY2]], implicit [[COPY3]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 @@ -340,7 +340,7 @@ body: | ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY]].sub0, %subreg.sub2, [[V_MOV_B32_e32_1]], %subreg.sub3 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub2 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -367,8 +367,8 @@ body: | ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY]].sub0, %subreg.sub2, [[V_MOV_B32_e32_1]], %subreg.sub3 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub2 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY1]], implicit [[COPY2]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -420,7 +420,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY1]].sub0_sub1, %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub1 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 @@ -444,7 +444,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY1]].sub0_sub1, %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub2 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 @@ -468,7 +468,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE 
[[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY1]].sub0_sub1, %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub3 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 @@ -492,7 +492,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY1]].sub0_sub1, %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub0 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 @@ -516,7 +516,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub0_sub1, [[COPY1]].sub0_sub1, %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub1 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 @@ -540,7 +540,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]].sub0_sub1, %subreg.sub2_sub3, [[COPY]].sub2_sub3, %subreg.sub0_sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub1 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 @@ -564,7 +564,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]].sub0_sub1, %subreg.sub2_sub3, [[COPY]].sub2_sub3, %subreg.sub0_sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub3 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vreg_128 = COPY $vgpr5_vgpr6_vgpr7_vgpr8 @@ -588,7 +588,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr5_vgpr6 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub1_sub2_sub3, %subreg.sub0_sub1_sub2, [[COPY1]].sub1, %subreg.sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vreg_64 = COPY $vgpr5_vgpr6 @@ -615,9 +615,9 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr5_vgpr6 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]].sub1_sub2_sub3, %subreg.sub0_sub1_sub2, [[COPY1]].sub1, %subreg.sub3 - ; CHECK-NEXT: 
[[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub2 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub3 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub1 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]], implicit [[COPY5]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 @@ -650,12 +650,12 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr5_vgpr6 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_192 = REG_SEQUENCE [[COPY]].sub2_sub3, %subreg.sub4_sub5, [[COPY]].sub1_sub2, %subreg.sub0_sub1, [[COPY1]], %subreg.sub2_sub3 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub2 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub4 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub5 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub2 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]].sub1 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub2 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub3 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY6]], implicit [[COPY7]] %0:vreg_128 = COPY $vgpr1_vgpr2_vgpr3_vgpr4 %1:vreg_64 = COPY $vgpr5_vgpr6 diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll index a6a0b88dce125..b008f397318e8 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll @@ -1,7 +1,13 @@ -; RUN: not --crash opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s 2>&1 | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s 2>&1 \ +; RUN: | FileCheck --match-full-lines --implicit-check-not='declare' %s -; CHECK: function declaration may only have a unique !dbg attachment -; CHECK-NEXT: ptr @0 +; Confirms we do not leave behind a declaration which references the same +; DISubprogram metadata. 
+ +; CHECK: define amdgpu_kernel void @preload_block_count_x{{.*}} !dbg ![[#]] !max_work_group_size ![[#]] { +; CHECK: declare void @0{{.*}} #[[#]] +; CHECK: declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #[[#]] +; CHECK: declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #[[#]] define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) !dbg !4 !max_work_group_size !7 { %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() diff --git a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll index d2b960fe43f84..0d6bccad89d82 100644 --- a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll @@ -1,13 +1,158 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -enable-var-scope %s -; CHECK-LABEL: {{^}}spill_more_than_wavesize_csr_sgprs: -; CHECK-DAG: v_writelane_b32 v0, s98, 63 -; CHECK-DAG: v_writelane_b32 v1, s99, 0 -; CHECK-NOT: dummy -; CHECK-DAG: v_readlane_b32 s99, v1, 0 -; CHECK-DAG: v_readlane_b32 s98, v0, 63 - define void @spill_more_than_wavesize_csr_sgprs() { +; CHECK-LABEL: spill_more_than_wavesize_csr_sgprs: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: v_writelane_b32 v0, s35, 0 +; CHECK-NEXT: v_writelane_b32 v0, s36, 1 +; CHECK-NEXT: v_writelane_b32 v0, s37, 2 +; CHECK-NEXT: v_writelane_b32 v0, s38, 3 +; CHECK-NEXT: v_writelane_b32 v0, s39, 4 +; CHECK-NEXT: v_writelane_b32 v0, s40, 5 +; CHECK-NEXT: v_writelane_b32 v0, s41, 6 +; CHECK-NEXT: v_writelane_b32 v0, s42, 7 +; CHECK-NEXT: v_writelane_b32 v0, s43, 8 +; CHECK-NEXT: v_writelane_b32 v0, s44, 9 +; CHECK-NEXT: v_writelane_b32 v0, s45, 10 +; CHECK-NEXT: v_writelane_b32 v0, s46, 11 +; CHECK-NEXT: v_writelane_b32 v0, s47, 12 +; CHECK-NEXT: v_writelane_b32 v0, s48, 13 +; CHECK-NEXT: v_writelane_b32 v0, s49, 14 +; CHECK-NEXT: v_writelane_b32 v0, s50, 15 +; CHECK-NEXT: v_writelane_b32 v0, s51, 16 +; CHECK-NEXT: v_writelane_b32 v0, s52, 17 +; CHECK-NEXT: v_writelane_b32 v0, s53, 18 +; CHECK-NEXT: v_writelane_b32 v0, s54, 19 +; CHECK-NEXT: v_writelane_b32 v0, s55, 20 +; CHECK-NEXT: v_writelane_b32 v0, s56, 21 +; CHECK-NEXT: v_writelane_b32 v0, s57, 22 +; CHECK-NEXT: v_writelane_b32 v0, s58, 23 +; CHECK-NEXT: v_writelane_b32 v0, s59, 24 +; CHECK-NEXT: v_writelane_b32 v0, s60, 25 +; CHECK-NEXT: v_writelane_b32 v0, s61, 26 +; CHECK-NEXT: v_writelane_b32 v0, s62, 27 +; CHECK-NEXT: v_writelane_b32 v0, s63, 28 +; CHECK-NEXT: v_writelane_b32 v0, s64, 29 +; CHECK-NEXT: v_writelane_b32 v0, s65, 30 +; CHECK-NEXT: v_writelane_b32 v0, s66, 31 +; CHECK-NEXT: v_writelane_b32 v0, s67, 32 +; CHECK-NEXT: v_writelane_b32 v0, s68, 33 +; CHECK-NEXT: v_writelane_b32 v0, s69, 34 +; CHECK-NEXT: v_writelane_b32 v0, s70, 35 +; CHECK-NEXT: v_writelane_b32 v0, s71, 36 +; CHECK-NEXT: v_writelane_b32 v0, s72, 37 +; CHECK-NEXT: v_writelane_b32 v0, s73, 38 +; CHECK-NEXT: v_writelane_b32 v0, s74, 39 +; CHECK-NEXT: v_writelane_b32 v0, s75, 40 +; CHECK-NEXT: v_writelane_b32 v0, s76, 41 +; CHECK-NEXT: v_writelane_b32 v0, 
s77, 42 +; CHECK-NEXT: v_writelane_b32 v0, s78, 43 +; CHECK-NEXT: v_writelane_b32 v0, s79, 44 +; CHECK-NEXT: v_writelane_b32 v0, s80, 45 +; CHECK-NEXT: v_writelane_b32 v0, s81, 46 +; CHECK-NEXT: v_writelane_b32 v0, s82, 47 +; CHECK-NEXT: v_writelane_b32 v0, s83, 48 +; CHECK-NEXT: v_writelane_b32 v0, s84, 49 +; CHECK-NEXT: v_writelane_b32 v0, s85, 50 +; CHECK-NEXT: v_writelane_b32 v0, s86, 51 +; CHECK-NEXT: v_writelane_b32 v0, s87, 52 +; CHECK-NEXT: v_writelane_b32 v0, s88, 53 +; CHECK-NEXT: v_writelane_b32 v0, s89, 54 +; CHECK-NEXT: v_writelane_b32 v0, s90, 55 +; CHECK-NEXT: v_writelane_b32 v0, s91, 56 +; CHECK-NEXT: v_writelane_b32 v0, s92, 57 +; CHECK-NEXT: v_writelane_b32 v0, s93, 58 +; CHECK-NEXT: v_writelane_b32 v0, s94, 59 +; CHECK-NEXT: v_writelane_b32 v0, s95, 60 +; CHECK-NEXT: v_writelane_b32 v1, s99, 0 +; CHECK-NEXT: v_writelane_b32 v0, s96, 61 +; CHECK-NEXT: v_writelane_b32 v1, s100, 1 +; CHECK-NEXT: v_writelane_b32 v0, s97, 62 +; CHECK-NEXT: v_writelane_b32 v1, s101, 2 +; CHECK-NEXT: v_writelane_b32 v0, s98, 63 +; CHECK-NEXT: v_writelane_b32 v1, s102, 3 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s102, v1, 3 +; CHECK-NEXT: v_readlane_b32 s101, v1, 2 +; CHECK-NEXT: v_readlane_b32 s100, v1, 1 +; CHECK-NEXT: v_readlane_b32 s99, v1, 0 +; CHECK-NEXT: v_readlane_b32 s98, v0, 63 +; CHECK-NEXT: v_readlane_b32 s97, v0, 62 +; CHECK-NEXT: v_readlane_b32 s96, v0, 61 +; CHECK-NEXT: v_readlane_b32 s95, v0, 60 +; CHECK-NEXT: v_readlane_b32 s94, v0, 59 +; CHECK-NEXT: v_readlane_b32 s93, v0, 58 +; CHECK-NEXT: v_readlane_b32 s92, v0, 57 +; CHECK-NEXT: v_readlane_b32 s91, v0, 56 +; CHECK-NEXT: v_readlane_b32 s90, v0, 55 +; CHECK-NEXT: v_readlane_b32 s89, v0, 54 +; CHECK-NEXT: v_readlane_b32 s88, v0, 53 +; CHECK-NEXT: v_readlane_b32 s87, v0, 52 +; CHECK-NEXT: v_readlane_b32 s86, v0, 51 +; CHECK-NEXT: v_readlane_b32 s85, v0, 50 +; CHECK-NEXT: v_readlane_b32 s84, v0, 49 +; CHECK-NEXT: v_readlane_b32 s83, v0, 48 +; CHECK-NEXT: v_readlane_b32 s82, v0, 47 +; CHECK-NEXT: v_readlane_b32 s81, v0, 46 +; CHECK-NEXT: v_readlane_b32 s80, v0, 45 +; CHECK-NEXT: v_readlane_b32 s79, v0, 44 +; CHECK-NEXT: v_readlane_b32 s78, v0, 43 +; CHECK-NEXT: v_readlane_b32 s77, v0, 42 +; CHECK-NEXT: v_readlane_b32 s76, v0, 41 +; CHECK-NEXT: v_readlane_b32 s75, v0, 40 +; CHECK-NEXT: v_readlane_b32 s74, v0, 39 +; CHECK-NEXT: v_readlane_b32 s73, v0, 38 +; CHECK-NEXT: v_readlane_b32 s72, v0, 37 +; CHECK-NEXT: v_readlane_b32 s71, v0, 36 +; CHECK-NEXT: v_readlane_b32 s70, v0, 35 +; CHECK-NEXT: v_readlane_b32 s69, v0, 34 +; CHECK-NEXT: v_readlane_b32 s68, v0, 33 +; CHECK-NEXT: v_readlane_b32 s67, v0, 32 +; CHECK-NEXT: v_readlane_b32 s66, v0, 31 +; CHECK-NEXT: v_readlane_b32 s65, v0, 30 +; CHECK-NEXT: v_readlane_b32 s64, v0, 29 +; CHECK-NEXT: v_readlane_b32 s63, v0, 28 +; CHECK-NEXT: v_readlane_b32 s62, v0, 27 +; CHECK-NEXT: v_readlane_b32 s61, v0, 26 +; CHECK-NEXT: v_readlane_b32 s60, v0, 25 +; CHECK-NEXT: v_readlane_b32 s59, v0, 24 +; CHECK-NEXT: v_readlane_b32 s58, v0, 23 +; CHECK-NEXT: v_readlane_b32 s57, v0, 22 +; CHECK-NEXT: v_readlane_b32 s56, v0, 21 +; CHECK-NEXT: v_readlane_b32 s55, v0, 20 +; CHECK-NEXT: v_readlane_b32 s54, v0, 19 +; CHECK-NEXT: v_readlane_b32 s53, v0, 18 +; CHECK-NEXT: v_readlane_b32 s52, v0, 17 +; CHECK-NEXT: v_readlane_b32 s51, v0, 16 +; CHECK-NEXT: v_readlane_b32 s50, v0, 15 +; CHECK-NEXT: v_readlane_b32 s49, v0, 14 +; CHECK-NEXT: v_readlane_b32 s48, v0, 13 +; CHECK-NEXT: v_readlane_b32 s47, v0, 12 +; CHECK-NEXT: v_readlane_b32 s46, v0, 11 +; CHECK-NEXT: 
v_readlane_b32 s45, v0, 10 +; CHECK-NEXT: v_readlane_b32 s44, v0, 9 +; CHECK-NEXT: v_readlane_b32 s43, v0, 8 +; CHECK-NEXT: v_readlane_b32 s42, v0, 7 +; CHECK-NEXT: v_readlane_b32 s41, v0, 6 +; CHECK-NEXT: v_readlane_b32 s40, v0, 5 +; CHECK-NEXT: v_readlane_b32 s39, v0, 4 +; CHECK-NEXT: v_readlane_b32 s38, v0, 3 +; CHECK-NEXT: v_readlane_b32 s37, v0, 2 +; CHECK-NEXT: v_readlane_b32 s36, v0, 1 +; CHECK-NEXT: v_readlane_b32 s35, v0, 0 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "", "~{s35},~{s36},~{s37},~{s38},~{s39},~{s40},~{s41},~{s42} ,~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49},~{s50} @@ -21,13 +166,161 @@ define void @spill_more_than_wavesize_csr_sgprs() { ret void } -; CHECK-LABEL: {{^}}spill_more_than_wavesize_csr_sgprs_with_stack_object: -; CHECK-DAG: v_writelane_b32 v1, s98, 63 -; CHECK-DAG: v_writelane_b32 v2, s99, 0 -; CHECK-NOT: dummy -; CHECK-DAG: v_readlane_b32 s99, v2, 0 -; CHECK-DAG: v_readlane_b32 s98, v1, 63 define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() { +; CHECK-LABEL: spill_more_than_wavesize_csr_sgprs_with_stack_object: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: v_writelane_b32 v1, s35, 0 +; CHECK-NEXT: v_writelane_b32 v1, s36, 1 +; CHECK-NEXT: v_writelane_b32 v1, s37, 2 +; CHECK-NEXT: v_writelane_b32 v1, s38, 3 +; CHECK-NEXT: v_writelane_b32 v1, s39, 4 +; CHECK-NEXT: v_writelane_b32 v1, s40, 5 +; CHECK-NEXT: v_writelane_b32 v1, s41, 6 +; CHECK-NEXT: v_writelane_b32 v1, s42, 7 +; CHECK-NEXT: v_writelane_b32 v1, s43, 8 +; CHECK-NEXT: v_writelane_b32 v1, s44, 9 +; CHECK-NEXT: v_writelane_b32 v1, s45, 10 +; CHECK-NEXT: v_writelane_b32 v1, s46, 11 +; CHECK-NEXT: v_writelane_b32 v1, s47, 12 +; CHECK-NEXT: v_writelane_b32 v1, s48, 13 +; CHECK-NEXT: v_writelane_b32 v1, s49, 14 +; CHECK-NEXT: v_writelane_b32 v1, s50, 15 +; CHECK-NEXT: v_writelane_b32 v1, s51, 16 +; CHECK-NEXT: v_writelane_b32 v1, s52, 17 +; CHECK-NEXT: v_writelane_b32 v1, s53, 18 +; CHECK-NEXT: v_writelane_b32 v1, s54, 19 +; CHECK-NEXT: v_writelane_b32 v1, s55, 20 +; CHECK-NEXT: v_writelane_b32 v1, s56, 21 +; CHECK-NEXT: v_writelane_b32 v1, s57, 22 +; CHECK-NEXT: v_writelane_b32 v1, s58, 23 +; CHECK-NEXT: v_writelane_b32 v1, s59, 24 +; CHECK-NEXT: v_writelane_b32 v1, s60, 25 +; CHECK-NEXT: v_writelane_b32 v1, s61, 26 +; CHECK-NEXT: v_writelane_b32 v1, s62, 27 +; CHECK-NEXT: v_writelane_b32 v1, s63, 28 +; CHECK-NEXT: v_writelane_b32 v1, s64, 29 +; CHECK-NEXT: v_writelane_b32 v1, s65, 30 +; CHECK-NEXT: v_writelane_b32 v1, s66, 31 +; CHECK-NEXT: v_writelane_b32 v1, s67, 32 +; CHECK-NEXT: v_writelane_b32 v1, s68, 33 +; CHECK-NEXT: v_writelane_b32 v1, s69, 34 +; CHECK-NEXT: v_writelane_b32 v1, s70, 35 +; CHECK-NEXT: v_writelane_b32 v1, s71, 36 +; CHECK-NEXT: v_writelane_b32 v1, s72, 37 +; CHECK-NEXT: v_writelane_b32 v1, s73, 38 +; CHECK-NEXT: v_writelane_b32 v1, s74, 39 +; CHECK-NEXT: v_writelane_b32 v1, s75, 40 +; CHECK-NEXT: v_writelane_b32 v1, s76, 41 +; CHECK-NEXT: v_writelane_b32 v1, s77, 42 +; 
CHECK-NEXT: v_writelane_b32 v1, s78, 43 +; CHECK-NEXT: v_writelane_b32 v1, s79, 44 +; CHECK-NEXT: v_writelane_b32 v1, s80, 45 +; CHECK-NEXT: v_writelane_b32 v1, s81, 46 +; CHECK-NEXT: v_writelane_b32 v1, s82, 47 +; CHECK-NEXT: v_writelane_b32 v1, s83, 48 +; CHECK-NEXT: v_writelane_b32 v1, s84, 49 +; CHECK-NEXT: v_writelane_b32 v1, s85, 50 +; CHECK-NEXT: v_writelane_b32 v1, s86, 51 +; CHECK-NEXT: v_writelane_b32 v1, s87, 52 +; CHECK-NEXT: v_writelane_b32 v1, s88, 53 +; CHECK-NEXT: v_writelane_b32 v1, s89, 54 +; CHECK-NEXT: v_writelane_b32 v1, s90, 55 +; CHECK-NEXT: v_writelane_b32 v1, s91, 56 +; CHECK-NEXT: v_writelane_b32 v1, s92, 57 +; CHECK-NEXT: v_writelane_b32 v1, s93, 58 +; CHECK-NEXT: v_writelane_b32 v1, s94, 59 +; CHECK-NEXT: v_writelane_b32 v1, s95, 60 +; CHECK-NEXT: v_writelane_b32 v2, s99, 0 +; CHECK-NEXT: v_writelane_b32 v1, s96, 61 +; CHECK-NEXT: v_writelane_b32 v2, s100, 1 +; CHECK-NEXT: v_writelane_b32 v1, s97, 62 +; CHECK-NEXT: v_writelane_b32 v2, s101, 2 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_writelane_b32 v1, s98, 63 +; CHECK-NEXT: v_writelane_b32 v2, s102, 3 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: v_readlane_b32 s102, v2, 3 +; CHECK-NEXT: v_readlane_b32 s101, v2, 2 +; CHECK-NEXT: v_readlane_b32 s100, v2, 1 +; CHECK-NEXT: v_readlane_b32 s99, v2, 0 +; CHECK-NEXT: v_readlane_b32 s98, v1, 63 +; CHECK-NEXT: v_readlane_b32 s97, v1, 62 +; CHECK-NEXT: v_readlane_b32 s96, v1, 61 +; CHECK-NEXT: v_readlane_b32 s95, v1, 60 +; CHECK-NEXT: v_readlane_b32 s94, v1, 59 +; CHECK-NEXT: v_readlane_b32 s93, v1, 58 +; CHECK-NEXT: v_readlane_b32 s92, v1, 57 +; CHECK-NEXT: v_readlane_b32 s91, v1, 56 +; CHECK-NEXT: v_readlane_b32 s90, v1, 55 +; CHECK-NEXT: v_readlane_b32 s89, v1, 54 +; CHECK-NEXT: v_readlane_b32 s88, v1, 53 +; CHECK-NEXT: v_readlane_b32 s87, v1, 52 +; CHECK-NEXT: v_readlane_b32 s86, v1, 51 +; CHECK-NEXT: v_readlane_b32 s85, v1, 50 +; CHECK-NEXT: v_readlane_b32 s84, v1, 49 +; CHECK-NEXT: v_readlane_b32 s83, v1, 48 +; CHECK-NEXT: v_readlane_b32 s82, v1, 47 +; CHECK-NEXT: v_readlane_b32 s81, v1, 46 +; CHECK-NEXT: v_readlane_b32 s80, v1, 45 +; CHECK-NEXT: v_readlane_b32 s79, v1, 44 +; CHECK-NEXT: v_readlane_b32 s78, v1, 43 +; CHECK-NEXT: v_readlane_b32 s77, v1, 42 +; CHECK-NEXT: v_readlane_b32 s76, v1, 41 +; CHECK-NEXT: v_readlane_b32 s75, v1, 40 +; CHECK-NEXT: v_readlane_b32 s74, v1, 39 +; CHECK-NEXT: v_readlane_b32 s73, v1, 38 +; CHECK-NEXT: v_readlane_b32 s72, v1, 37 +; CHECK-NEXT: v_readlane_b32 s71, v1, 36 +; CHECK-NEXT: v_readlane_b32 s70, v1, 35 +; CHECK-NEXT: v_readlane_b32 s69, v1, 34 +; CHECK-NEXT: v_readlane_b32 s68, v1, 33 +; CHECK-NEXT: v_readlane_b32 s67, v1, 32 +; CHECK-NEXT: v_readlane_b32 s66, v1, 31 +; CHECK-NEXT: v_readlane_b32 s65, v1, 30 +; CHECK-NEXT: v_readlane_b32 s64, v1, 29 +; CHECK-NEXT: v_readlane_b32 s63, v1, 28 +; CHECK-NEXT: v_readlane_b32 s62, v1, 27 +; CHECK-NEXT: v_readlane_b32 s61, v1, 26 +; CHECK-NEXT: v_readlane_b32 s60, v1, 25 +; CHECK-NEXT: v_readlane_b32 s59, v1, 24 +; CHECK-NEXT: v_readlane_b32 s58, v1, 23 +; CHECK-NEXT: v_readlane_b32 s57, v1, 22 +; CHECK-NEXT: v_readlane_b32 s56, v1, 21 +; CHECK-NEXT: v_readlane_b32 s55, v1, 20 +; CHECK-NEXT: v_readlane_b32 s54, v1, 19 +; CHECK-NEXT: v_readlane_b32 s53, v1, 18 +; CHECK-NEXT: v_readlane_b32 s52, v1, 17 +; CHECK-NEXT: v_readlane_b32 s51, v1, 16 +; CHECK-NEXT: v_readlane_b32 s50, v1, 15 +; CHECK-NEXT: v_readlane_b32 s49, v1, 14 +; CHECK-NEXT: v_readlane_b32 s48, v1, 
13 +; CHECK-NEXT: v_readlane_b32 s47, v1, 12 +; CHECK-NEXT: v_readlane_b32 s46, v1, 11 +; CHECK-NEXT: v_readlane_b32 s45, v1, 10 +; CHECK-NEXT: v_readlane_b32 s44, v1, 9 +; CHECK-NEXT: v_readlane_b32 s43, v1, 8 +; CHECK-NEXT: v_readlane_b32 s42, v1, 7 +; CHECK-NEXT: v_readlane_b32 s41, v1, 6 +; CHECK-NEXT: v_readlane_b32 s40, v1, 5 +; CHECK-NEXT: v_readlane_b32 s39, v1, 4 +; CHECK-NEXT: v_readlane_b32 s38, v1, 3 +; CHECK-NEXT: v_readlane_b32 s37, v1, 2 +; CHECK-NEXT: v_readlane_b32 s36, v1, 1 +; CHECK-NEXT: v_readlane_b32 s35, v1, 0 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca call void asm sideeffect "", diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index fed60eecc8a8b..0e568e3071e99 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -8,92 +8,168 @@ ; 4 byte emergency stack slot ; = 144 bytes with padding between them -; GCN-LABEL: {{^}}needs_align16_default_stack_align: -; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, v0 -; GCN-DAG: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, s32 -; GCN: v_add_u32_e32 [[FI:v[0-9]+]], vcc, [[SCALED_IDX]], [[FRAMEDIFF]] - -; GCN-NOT: s32 - -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen - -; GCN-NOT: s32 - -; GCN: ; ScratchSize: 144 define void @needs_align16_default_stack_align(i32 %idx) #0 { +; GCN-LABEL: needs_align16_default_stack_align: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s32 +; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: v_mov_b32_e32 v1, 4 +; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, 12, v0 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, 8, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 3 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 2 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN: ; ScratchSize: 144 %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5) %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx store volatile <4 x i32> , ptr addrspace(5) %gep0, align 16 ret void } -; GCN-LABEL: {{^}}needs_align16_stack_align4: -; GCN: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0x3c0{{$}} -; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xfffffc00 - -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_addk_i32 s32, 0x2800{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, 
s[0:3], 0 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen - -; GCN: s_mov_b32 s32, s34 - -; GCN: ; ScratchSize: 160 define void @needs_align16_stack_align4(i32 %idx) #2 { +; GCN-LABEL: needs_align16_stack_align4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x3c0 +; GCN-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 +; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: v_mov_b32_e32 v1, 4 +; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, 12, v0 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, 8, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 3 +; GCN-NEXT: s_mov_b32 s5, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_addk_i32 s32, 0x2800 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 2 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s5 +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN: ; ScratchSize: 160 %alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5) %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx store volatile <4 x i32> , ptr addrspace(5) %gep0, align 16 ret void } -; GCN-LABEL: {{^}}needs_align32: -; GCN: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0x7c0{{$}} -; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xfffff800 - -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_addk_i32 s32, 0x3000{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_mov_b32 s32, s34 - -; GCN: ; ScratchSize: 192 define void @needs_align32(i32 %idx) #0 { +; GCN-LABEL: needs_align32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x7c0 +; GCN-NEXT: s_and_b32 s33, s33, 0xfffff800 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33 +; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: v_mov_b32_e32 v1, 4 +; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, 12, v0 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, 8, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 3 +; GCN-NEXT: s_mov_b32 s5, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_addk_i32 s32, 0x3000 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 2 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s5 +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN: ; ScratchSize: 192 %alloca.align16 = alloca [8 x <4 x i32>], align 32, addrspace(5) %gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) 
%alloca.align16, i32 0, i32 %idx store volatile <4 x i32> , ptr addrspace(5) %gep0, align 32 ret void } -; GCN-LABEL: {{^}}force_realign4: -; GCN: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0xc0{{$}} -; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffffff00 -; GCN: s_addk_i32 s32, 0xd00{{$}} - -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_mov_b32 s32, s34 - -; GCN: ; ScratchSize: 52 define void @force_realign4(i32 %idx) #1 { +; GCN-LABEL: force_realign4: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0xc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffff00 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33 +; GCN-NEXT: s_mov_b32 s5, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_addk_i32 s32, 0xd00 +; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v1, 3 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s5 +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN: ; ScratchSize: 52 %alloca.align16 = alloca [8 x i32], align 4, addrspace(5) %gep0 = getelementptr inbounds [8 x i32], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx store volatile i32 3, ptr addrspace(5) %gep0, align 4 ret void } -; GCN-LABEL: {{^}}kernel_call_align16_from_8: -; GCN: s_movk_i32 s32, 0x400{{$}} -; GCN-NOT: s32 -; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_align16_from_8() #0 { +; GCN-LABEL: kernel_call_align16_from_8: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_getpc_b64 s[14:15] +; GCN-NEXT: s_add_u32 s14, s14, needs_align16_default_stack_align@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s15, s15, needs_align16_default_stack_align@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v3, 2 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_movk_i32 s32, 0x400 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: s_endpgm %alloca = alloca i32, align 4, addrspace(5) store volatile i32 2, ptr addrspace(5) %alloca call void @needs_align16_default_stack_align(i32 1) @@ -101,10 +177,32 @@ define amdgpu_kernel void @kernel_call_align16_from_8() #0 { } ; The call sequence should keep the stack on call aligned to 4 -; GCN-LABEL: {{^}}kernel_call_align16_from_5: -; GCN: s_movk_i32 s32, 0x400 -; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_align16_from_5() { +; GCN-LABEL: kernel_call_align16_from_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_getpc_b64 s[14:15] +; GCN-NEXT: s_add_u32 s14, s14, needs_align16_default_stack_align@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s15, s15, 
needs_align16_default_stack_align@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v3, 2 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_movk_i32 s32, 0x400 +; GCN-NEXT: buffer_store_byte v3, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: s_endpgm %alloca0 = alloca i8, align 1, addrspace(5) store volatile i8 2, ptr addrspace(5) %alloca0 @@ -112,10 +210,32 @@ define amdgpu_kernel void @kernel_call_align16_from_5() { ret void } -; GCN-LABEL: {{^}}kernel_call_align4_from_5: -; GCN: s_movk_i32 s32, 0x400 -; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_align4_from_5() { +; GCN-LABEL: kernel_call_align4_from_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_mov_b32 s13, s15 +; GCN-NEXT: s_mov_b32 s12, s14 +; GCN-NEXT: s_getpc_b64 s[14:15] +; GCN-NEXT: s_add_u32 s14, s14, needs_align16_stack_align4@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s15, s15, needs_align16_stack_align4@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_mov_b32_e32 v3, 2 +; GCN-NEXT: v_or_b32_e32 v31, v0, v2 +; GCN-NEXT: s_mov_b32 s14, s16 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_movk_i32 s32, 0x400 +; GCN-NEXT: buffer_store_byte v3, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] +; GCN-NEXT: s_endpgm %alloca0 = alloca i8, align 1, addrspace(5) store volatile i8 2, ptr addrspace(5) %alloca0 @@ -123,28 +243,36 @@ define amdgpu_kernel void @kernel_call_align4_from_5() { ret void } -; GCN-LABEL: {{^}}default_realign_align128: -; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 -; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 -; GCN-NEXT: s_mov_b32 s5, s34 -; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: s_addk_i32 s32, 0x4000 -; GCN-NOT: s33 -; GCN: buffer_store_dword v0, off, s[0:3], s33{{$}} -; GCN: s_mov_b32 s32, s34 -; GCN: s_mov_b32 s33, [[FP_COPY]] define void @default_realign_align128(i32 %idx) #0 { +; GCN-LABEL: default_realign_align128: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_mov_b32 s5, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_addk_i32 s32, 0x4000 +; GCN-NEXT: v_mov_b32_e32 v0, 9 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s5 +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_setpc_b64 s[30:31] %alloca.align = alloca i32, align 128, addrspace(5) store volatile i32 9, ptr addrspace(5) %alloca.align, align 128 ret void } -; GCN-LABEL: {{^}}disable_realign_align128: -; GCN-NOT: s32 -; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} -; GCN-NOT: s32 define void @disable_realign_align128(i32 %idx) #3 { +; GCN-LABEL: disable_realign_align128: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: 
v_mov_b32_e32 v0, 9 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %alloca.align = alloca i32, align 128, addrspace(5) store volatile i32 9, ptr addrspace(5) %alloca.align, align 128 ret void @@ -156,35 +284,48 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 { ; since there is a local object with an alignment of 1024. ; Should use BP to access the incoming stack arguments. ; The BP value is saved/restored with a VGPR spill. - ; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill: -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0 -; GCN-NEXT: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 -; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GCN-NEXT: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], [[FP_SCRATCH_COPY]], 2 -; GCN-NEXT: v_mov_b32_e32 v32, 0 -; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3 -; GCN: s_mov_b32 s34, s32 -; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 -; GCN-DAG: s_add_i32 s32, s32, 0x30000 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 -; GCN: s_swappc_b64 s[30:31], +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s16, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0xffc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000 +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[18:19] +; GCN-NEXT: v_writelane_b32 v40, s16, 2 +; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: v_writelane_b32 v40, s34, 3 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:1024 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s34 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s34 offset:4 +; GCN-NEXT: s_add_i32 s32, s32, 0x30000 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: v_readlane_b32 s31, v40, 1 +; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: v_readlane_b32 s34, v40, 3 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1 -; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0 -; GCN-NEXT: s_mov_b32 s32, s34 -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG]], 2 -; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3 -; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload -; 
GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] -; GCN: s_setpc_b64 s[30:31] %temp = alloca i32, align 1024, addrspace(5) store volatile i32 0, ptr addrspace(5) %temp, align 1024 call void @extern_func(<32 x i32> %a, i32 %b) @@ -198,23 +339,56 @@ define i32 @needs_align1024_stack_args_used_inside_loop(ptr addrspace(5) nocaptu ; index variable, the base pointer first get loaded into a VGPR ; and that value should be further referenced to load the incoming values. ; The BP value will get saved/restored in an SGPR at the prolgoue/epilogue. - ; GCN-LABEL: needs_align1024_stack_args_used_inside_loop: -; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_add_i32 s33, s32, 0xffc0 -; GCN-NEXT: s_mov_b32 [[BP_COPY:s[0-9]+]], s34 -; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000 -; GCN-NEXT: v_lshrrev_b32_e64 [[VGPR_REG:v[0-9]+]], 6, s34 -; GCN-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0 -; GCN: s_add_i32 s32, s32, 0x30000 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:1024 -; GCN: buffer_load_dword v{{[0-9]+}}, [[VGPR_REG]], s[0:3], 0 offen -; GCN: v_add_u32_e32 [[VGPR_REG]], vcc, 4, [[VGPR_REG]] -; GCN: s_mov_b32 s32, s34 -; GCN-NEXT: s_mov_b32 s34, [[BP_COPY]] -; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN: ; %bb.0: ; %begin +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s11, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0xffc0 +; GCN-NEXT: s_mov_b32 s14, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000 +; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s34 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: s_add_i32 s32, s32, 0x30000 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1024 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 +; GCN-NEXT: s_branch .LBB10_2 +; GCN-NEXT: .LBB10_1: ; %Flow +; GCN-NEXT: ; in Loop: Header=BB10_2 Depth=1 +; GCN-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN-NEXT: s_and_b64 s[8:9], exec, s[6:7] +; GCN-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB10_4 +; GCN-NEXT: .LBB10_2: ; %loop_body +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s10, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN-NEXT: s_cbranch_execz .LBB10_1 +; GCN-NEXT: ; %bb.3: ; %loop_end +; GCN-NEXT: ; in Loop: Header=BB10_2 Depth=1 +; GCN-NEXT: s_add_i32 s10, s10, 1 +; GCN-NEXT: s_cmp_eq_u32 s10, 9 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; GCN-NEXT: s_and_b64 s[12:13], s[12:13], exec +; GCN-NEXT: v_add_u32_e32 v1, vcc, 4, v1 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] +; GCN-NEXT: s_branch .LBB10_1 +; GCN-NEXT: .LBB10_4: ; %exit +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s14 +; GCN-NEXT: s_mov_b32 s33, s11 +; GCN-NEXT: s_setpc_b64 s[30:31] begin: %local_var = alloca i32, align 1024, addrspace(5) store volatile i32 0, ptr addrspace(5) %local_var, align 1024 @@ -239,16 +413,31 @@ exit: ; preds = %loop_end, %loop_b define void @no_free_scratch_sgpr_for_bp_copy(<32 x i32> %a, i32 %b) #0 { ; GCN-LABEL: no_free_scratch_sgpr_for_bp_copy: -; GCN: ; %bb.0: -; GCN: 
v_writelane_b32 [[VGPR_REG:v[0-9]+]], s34, 0 -; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 -; GCN: v_readlane_b32 s34, [[VGPR_REG:v[0-9]+]], 0 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ;;#ASMEND -; GCN: s_setpc_b64 s[30:31] +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 vcc_lo, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 v1, s34, 0 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4 +; GCN-NEXT: s_addk_i32 s32, 0x6000 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: v_readlane_b32 s34, v1, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b32 s33, vcc_lo +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, ptr addrspace(5) %local_val, align 128 ; Use all clobberable registers, so BP has to spill to a VGPR. @@ -262,15 +451,172 @@ define void @no_free_scratch_sgpr_for_bp_copy(<32 x i32> %a, i32 %b) #0 { define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 { ; If there are no free SGPRs or VGPRs available we must spill the BP to memory. 
- -; GCN-LABEL: no_free_regs_spill_bp_to_mem -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN: s_xor_saveexec_b64 s[6:7], -1 -; GCN: buffer_store_dword v39, off, s[0:3], s33 -; GCN: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]] -; GCN: buffer_store_dword v0, off, s[0:3], s33 -; GCN: v_mov_b32_e32 v0, s34 -; GCN-DAG: buffer_store_dword v0, off, s[0:3], s33 +; GCN-LABEL: no_free_regs_spill_bp_to_memory: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4 +; GCN-NEXT: v_writelane_b32 v39, s39, 0 +; GCN-NEXT: v_writelane_b32 v39, s40, 1 +; GCN-NEXT: v_writelane_b32 v39, s41, 2 +; GCN-NEXT: v_writelane_b32 v39, s42, 3 +; GCN-NEXT: v_writelane_b32 v39, s43, 4 +; GCN-NEXT: v_writelane_b32 v39, s44, 5 +; GCN-NEXT: v_writelane_b32 v39, s45, 6 +; GCN-NEXT: v_writelane_b32 v39, s46, 7 +; GCN-NEXT: v_writelane_b32 v39, s47, 8 +; GCN-NEXT: v_writelane_b32 v39, s48, 9 +; GCN-NEXT: v_writelane_b32 v39, s49, 10 +; GCN-NEXT: v_writelane_b32 v39, s50, 11 +; GCN-NEXT: v_writelane_b32 v39, s51, 12 +; GCN-NEXT: v_writelane_b32 v39, s52, 13 +; GCN-NEXT: v_writelane_b32 v39, s53, 14 +; GCN-NEXT: v_writelane_b32 v39, s54, 15 +; GCN-NEXT: v_writelane_b32 v39, s55, 16 +; GCN-NEXT: v_writelane_b32 v39, s56, 17 +; GCN-NEXT: v_writelane_b32 v39, s57, 18 +; GCN-NEXT: v_writelane_b32 v39, s58, 19 +; GCN-NEXT: v_writelane_b32 v39, s59, 20 +; GCN-NEXT: v_writelane_b32 v39, s60, 21 +; GCN-NEXT: v_writelane_b32 v39, s61, 22 +; GCN-NEXT: v_writelane_b32 v39, s62, 23 +; GCN-NEXT: v_writelane_b32 v39, s63, 24 +; GCN-NEXT: v_writelane_b32 v39, s64, 25 +; GCN-NEXT: v_writelane_b32 v39, s65, 26 +; GCN-NEXT: v_writelane_b32 v39, s66, 27 +; GCN-NEXT: v_writelane_b32 v39, s67, 28 +; GCN-NEXT: v_writelane_b32 v39, s68, 29 +; GCN-NEXT: v_writelane_b32 v39, s69, 30 +; GCN-NEXT: v_writelane_b32 v39, s70, 31 +; GCN-NEXT: v_writelane_b32 v39, s71, 32 +; GCN-NEXT: v_writelane_b32 v39, s72, 33 +; GCN-NEXT: v_writelane_b32 v39, s73, 34 +; GCN-NEXT: v_writelane_b32 v39, s74, 35 +; GCN-NEXT: v_writelane_b32 v39, s75, 36 +; GCN-NEXT: v_writelane_b32 v39, s76, 37 +; GCN-NEXT: v_writelane_b32 v39, s77, 38 +; GCN-NEXT: v_writelane_b32 v39, s78, 39 +; GCN-NEXT: v_writelane_b32 v39, s79, 40 +; GCN-NEXT: v_writelane_b32 v39, s80, 41 +; GCN-NEXT: v_writelane_b32 v39, s81, 42 +; GCN-NEXT: v_writelane_b32 v39, s82, 43 +; GCN-NEXT: v_writelane_b32 v39, s83, 44 +; GCN-NEXT: v_writelane_b32 v39, s84, 45 +; GCN-NEXT: v_writelane_b32 v39, s85, 46 +; GCN-NEXT: v_writelane_b32 v39, s86, 47 +; GCN-NEXT: v_writelane_b32 v39, s87, 48 +; GCN-NEXT: v_writelane_b32 v39, s88, 49 +; GCN-NEXT: v_writelane_b32 v39, s89, 50 +; GCN-NEXT: v_writelane_b32 v39, s90, 51 +; GCN-NEXT: v_writelane_b32 v39, s91, 52 +; GCN-NEXT: v_writelane_b32 v39, s92, 53 +; GCN-NEXT: v_writelane_b32 v39, s93, 54 +; GCN-NEXT: v_writelane_b32 v39, s94, 55 +; GCN-NEXT: v_writelane_b32 v39, s95, 56 +; GCN-NEXT: v_writelane_b32 v39, s96, 57 +; GCN-NEXT: v_writelane_b32 v39, 
s97, 58 +; GCN-NEXT: v_writelane_b32 v39, s98, 59 +; GCN-NEXT: v_writelane_b32 v39, s99, 60 +; GCN-NEXT: v_writelane_b32 v39, s100, 61 +; GCN-NEXT: v_writelane_b32 v39, s101, 62 +; GCN-NEXT: v_writelane_b32 v39, s102, 63 +; GCN-NEXT: s_addk_i32 s32, 0x6000 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; clobber all VGPRs +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s102, v39, 63 +; GCN-NEXT: v_readlane_b32 s101, v39, 62 +; GCN-NEXT: v_readlane_b32 s100, v39, 61 +; GCN-NEXT: v_readlane_b32 s99, v39, 60 +; GCN-NEXT: v_readlane_b32 s98, v39, 59 +; GCN-NEXT: v_readlane_b32 s97, v39, 58 +; GCN-NEXT: v_readlane_b32 s96, v39, 57 +; GCN-NEXT: v_readlane_b32 s95, v39, 56 +; GCN-NEXT: v_readlane_b32 s94, v39, 55 +; GCN-NEXT: v_readlane_b32 s93, v39, 54 +; GCN-NEXT: v_readlane_b32 s92, v39, 53 +; GCN-NEXT: v_readlane_b32 s91, v39, 52 +; GCN-NEXT: v_readlane_b32 s90, v39, 51 +; GCN-NEXT: v_readlane_b32 s89, v39, 50 +; GCN-NEXT: v_readlane_b32 s88, v39, 49 +; GCN-NEXT: v_readlane_b32 s87, v39, 48 +; GCN-NEXT: v_readlane_b32 s86, v39, 47 +; GCN-NEXT: v_readlane_b32 s85, v39, 46 +; GCN-NEXT: v_readlane_b32 s84, v39, 45 +; GCN-NEXT: v_readlane_b32 s83, v39, 44 +; GCN-NEXT: v_readlane_b32 s82, v39, 43 +; GCN-NEXT: v_readlane_b32 s81, v39, 42 +; GCN-NEXT: v_readlane_b32 s80, v39, 41 +; GCN-NEXT: v_readlane_b32 s79, v39, 40 +; GCN-NEXT: v_readlane_b32 s78, v39, 39 +; GCN-NEXT: v_readlane_b32 s77, v39, 38 +; GCN-NEXT: v_readlane_b32 s76, v39, 37 +; GCN-NEXT: v_readlane_b32 s75, v39, 36 +; GCN-NEXT: v_readlane_b32 s74, v39, 35 +; GCN-NEXT: v_readlane_b32 s73, v39, 34 +; GCN-NEXT: v_readlane_b32 s72, v39, 33 +; GCN-NEXT: v_readlane_b32 s71, v39, 32 +; GCN-NEXT: v_readlane_b32 s70, v39, 31 +; GCN-NEXT: v_readlane_b32 s69, v39, 30 +; GCN-NEXT: v_readlane_b32 s68, v39, 29 +; GCN-NEXT: v_readlane_b32 s67, v39, 28 +; GCN-NEXT: v_readlane_b32 s66, v39, 27 +; GCN-NEXT: v_readlane_b32 s65, v39, 26 +; GCN-NEXT: v_readlane_b32 s64, v39, 25 +; GCN-NEXT: v_readlane_b32 s63, v39, 24 +; GCN-NEXT: v_readlane_b32 s62, v39, 23 +; GCN-NEXT: v_readlane_b32 s61, v39, 22 +; GCN-NEXT: v_readlane_b32 s60, v39, 21 +; GCN-NEXT: v_readlane_b32 s59, v39, 20 +; GCN-NEXT: v_readlane_b32 s58, v39, 19 +; GCN-NEXT: v_readlane_b32 s57, v39, 18 +; GCN-NEXT: v_readlane_b32 s56, v39, 17 +; GCN-NEXT: v_readlane_b32 s55, v39, 16 +; GCN-NEXT: v_readlane_b32 s54, v39, 15 +; GCN-NEXT: v_readlane_b32 s53, v39, 14 +; GCN-NEXT: v_readlane_b32 s52, v39, 13 +; GCN-NEXT: v_readlane_b32 s51, v39, 12 +; GCN-NEXT: v_readlane_b32 s50, v39, 11 +; GCN-NEXT: v_readlane_b32 s49, v39, 10 +; GCN-NEXT: v_readlane_b32 s48, v39, 9 +; GCN-NEXT: v_readlane_b32 s47, v39, 8 +; GCN-NEXT: v_readlane_b32 s46, v39, 7 +; GCN-NEXT: v_readlane_b32 s45, v39, 6 +; GCN-NEXT: v_readlane_b32 s44, v39, 5 +; GCN-NEXT: v_readlane_b32 s43, v39, 4 +; GCN-NEXT: v_readlane_b32 s42, v39, 3 +; GCN-NEXT: v_readlane_b32 s41, v39, 2 +; GCN-NEXT: v_readlane_b32 s40, v39, 1 +; GCN-NEXT: v_readlane_b32 s39, v39, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s34, v0 +; GCN-NEXT: 
s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, ptr addrspace(5) %local_val, align 128 @@ -297,22 +643,179 @@ define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 { define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i32 %b, ptr addrspace(5) byval([4096 x i8]) align 4 %arg) #5 { ; If the size of the offset exceeds the MUBUF offset field we need another ; scratch VGPR to hold the offset. - -; GCN-LABEL: spill_bp_to_memory_scratch_reg_needed_mubuf_offset -; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 -; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 -; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GCN-NEXT: s_add_i32 s5, s33, 0x42100 -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]] -; GCN-NEXT: s_add_i32 s5, s33, 0x42200 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NEXT: s_add_i32 s5, s33, 0x42300 -; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-LABEL: spill_bp_to_memory_scratch_reg_needed_mubuf_offset: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 +; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_add_i32 s5, s33, 0x42100 +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_add_i32 s5, s33, 0x42200 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NEXT: s_add_i32 s5, s33, 0x42300 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4 +; GCN-NEXT: v_writelane_b32 v39, s39, 0 +; GCN-NEXT: v_writelane_b32 v39, s40, 1 +; GCN-NEXT: v_writelane_b32 v39, s41, 2 +; GCN-NEXT: v_writelane_b32 v39, s42, 3 +; GCN-NEXT: v_writelane_b32 v39, s43, 4 +; GCN-NEXT: v_writelane_b32 v39, s44, 5 +; GCN-NEXT: v_writelane_b32 v39, s45, 6 +; GCN-NEXT: v_writelane_b32 v39, s46, 7 +; GCN-NEXT: v_writelane_b32 v39, s47, 8 +; GCN-NEXT: v_writelane_b32 v39, s48, 9 +; GCN-NEXT: v_writelane_b32 v39, s49, 10 +; GCN-NEXT: v_writelane_b32 v39, s50, 11 +; GCN-NEXT: v_writelane_b32 v39, s51, 12 +; GCN-NEXT: v_writelane_b32 v39, s52, 13 +; GCN-NEXT: v_writelane_b32 v39, s53, 14 +; GCN-NEXT: v_writelane_b32 v39, s54, 15 +; GCN-NEXT: v_writelane_b32 v39, s55, 16 +; GCN-NEXT: v_writelane_b32 v39, s56, 17 +; GCN-NEXT: v_writelane_b32 v39, s57, 18 +; GCN-NEXT: v_writelane_b32 v39, s58, 19 +; GCN-NEXT: v_writelane_b32 v39, s59, 20 +; GCN-NEXT: v_writelane_b32 v39, s60, 21 +; GCN-NEXT: v_writelane_b32 v39, s61, 22 +; GCN-NEXT: v_writelane_b32 v39, s62, 23 +; GCN-NEXT: v_writelane_b32 v39, s63, 24 +; GCN-NEXT: v_writelane_b32 v39, s64, 25 +; GCN-NEXT: v_writelane_b32 v39, s65, 26 +; GCN-NEXT: v_writelane_b32 v39, s66, 27 +; GCN-NEXT: v_writelane_b32 v39, s67, 28 +; GCN-NEXT: v_writelane_b32 v39, s68, 29 +; GCN-NEXT: 
v_writelane_b32 v39, s69, 30 +; GCN-NEXT: v_writelane_b32 v39, s70, 31 +; GCN-NEXT: v_writelane_b32 v39, s71, 32 +; GCN-NEXT: v_writelane_b32 v39, s72, 33 +; GCN-NEXT: v_writelane_b32 v39, s73, 34 +; GCN-NEXT: v_writelane_b32 v39, s74, 35 +; GCN-NEXT: v_writelane_b32 v39, s75, 36 +; GCN-NEXT: v_writelane_b32 v39, s76, 37 +; GCN-NEXT: v_writelane_b32 v39, s77, 38 +; GCN-NEXT: v_writelane_b32 v39, s78, 39 +; GCN-NEXT: v_writelane_b32 v39, s79, 40 +; GCN-NEXT: v_writelane_b32 v39, s80, 41 +; GCN-NEXT: v_writelane_b32 v39, s81, 42 +; GCN-NEXT: v_writelane_b32 v39, s82, 43 +; GCN-NEXT: v_writelane_b32 v39, s83, 44 +; GCN-NEXT: v_writelane_b32 v39, s84, 45 +; GCN-NEXT: v_writelane_b32 v39, s85, 46 +; GCN-NEXT: v_writelane_b32 v39, s86, 47 +; GCN-NEXT: v_writelane_b32 v39, s87, 48 +; GCN-NEXT: v_writelane_b32 v39, s88, 49 +; GCN-NEXT: v_writelane_b32 v39, s89, 50 +; GCN-NEXT: v_writelane_b32 v39, s90, 51 +; GCN-NEXT: v_writelane_b32 v39, s91, 52 +; GCN-NEXT: v_writelane_b32 v39, s92, 53 +; GCN-NEXT: v_writelane_b32 v39, s93, 54 +; GCN-NEXT: v_writelane_b32 v39, s94, 55 +; GCN-NEXT: v_writelane_b32 v39, s95, 56 +; GCN-NEXT: v_writelane_b32 v39, s96, 57 +; GCN-NEXT: v_writelane_b32 v39, s97, 58 +; GCN-NEXT: v_writelane_b32 v39, s98, 59 +; GCN-NEXT: v_writelane_b32 v39, s99, 60 +; GCN-NEXT: v_writelane_b32 v39, s100, 61 +; GCN-NEXT: v_writelane_b32 v39, s101, 62 +; GCN-NEXT: v_mov_b32_e32 v1, 0x1080 +; GCN-NEXT: v_writelane_b32 v39, s102, 63 +; GCN-NEXT: s_add_i32 s32, s32, 0x46000 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; clobber all VGPRs +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_add_i32 s5, s33, 0x42200 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload +; GCN-NEXT: s_add_i32 s5, s33, 0x42300 +; GCN-NEXT: v_readlane_b32 s102, v39, 63 +; GCN-NEXT: v_readlane_b32 s101, v39, 62 +; GCN-NEXT: v_readlane_b32 s100, v39, 61 +; GCN-NEXT: v_readlane_b32 s99, v39, 60 +; GCN-NEXT: v_readlane_b32 s98, v39, 59 +; GCN-NEXT: v_readlane_b32 s97, v39, 58 +; GCN-NEXT: v_readlane_b32 s96, v39, 57 +; GCN-NEXT: v_readlane_b32 s95, v39, 56 +; GCN-NEXT: v_readlane_b32 s94, v39, 55 +; GCN-NEXT: v_readlane_b32 s93, v39, 54 +; GCN-NEXT: v_readlane_b32 s92, v39, 53 +; GCN-NEXT: v_readlane_b32 s91, v39, 52 +; GCN-NEXT: v_readlane_b32 s90, v39, 51 +; GCN-NEXT: v_readlane_b32 s89, v39, 50 +; GCN-NEXT: v_readlane_b32 s88, v39, 49 +; GCN-NEXT: v_readlane_b32 s87, v39, 48 +; GCN-NEXT: v_readlane_b32 s86, v39, 47 +; GCN-NEXT: v_readlane_b32 s85, v39, 46 +; GCN-NEXT: v_readlane_b32 s84, v39, 45 +; GCN-NEXT: v_readlane_b32 s83, v39, 44 +; GCN-NEXT: v_readlane_b32 s82, v39, 43 +; GCN-NEXT: v_readlane_b32 s81, v39, 42 +; GCN-NEXT: v_readlane_b32 s80, v39, 41 +; GCN-NEXT: v_readlane_b32 s79, v39, 40 +; GCN-NEXT: v_readlane_b32 s78, v39, 39 +; GCN-NEXT: v_readlane_b32 s77, v39, 38 +; GCN-NEXT: v_readlane_b32 s76, v39, 37 +; GCN-NEXT: v_readlane_b32 s75, v39, 36 +; GCN-NEXT: v_readlane_b32 s74, v39, 35 +; GCN-NEXT: v_readlane_b32 s73, v39, 34 +; GCN-NEXT: v_readlane_b32 s72, v39, 33 +; GCN-NEXT: v_readlane_b32 s71, v39, 32 +; GCN-NEXT: v_readlane_b32 s70, v39, 31 +; GCN-NEXT: v_readlane_b32 s69, v39, 30 +; GCN-NEXT: v_readlane_b32 s68, v39, 29 +; GCN-NEXT: v_readlane_b32 s67, v39, 28 +; GCN-NEXT: v_readlane_b32 s66, v39, 27 +; GCN-NEXT: v_readlane_b32 s65, v39, 26 +; 
GCN-NEXT: v_readlane_b32 s64, v39, 25 +; GCN-NEXT: v_readlane_b32 s63, v39, 24 +; GCN-NEXT: v_readlane_b32 s62, v39, 23 +; GCN-NEXT: v_readlane_b32 s61, v39, 22 +; GCN-NEXT: v_readlane_b32 s60, v39, 21 +; GCN-NEXT: v_readlane_b32 s59, v39, 20 +; GCN-NEXT: v_readlane_b32 s58, v39, 19 +; GCN-NEXT: v_readlane_b32 s57, v39, 18 +; GCN-NEXT: v_readlane_b32 s56, v39, 17 +; GCN-NEXT: v_readlane_b32 s55, v39, 16 +; GCN-NEXT: v_readlane_b32 s54, v39, 15 +; GCN-NEXT: v_readlane_b32 s53, v39, 14 +; GCN-NEXT: v_readlane_b32 s52, v39, 13 +; GCN-NEXT: v_readlane_b32 s51, v39, 12 +; GCN-NEXT: v_readlane_b32 s50, v39, 11 +; GCN-NEXT: v_readlane_b32 s49, v39, 10 +; GCN-NEXT: v_readlane_b32 s48, v39, 9 +; GCN-NEXT: v_readlane_b32 s47, v39, 8 +; GCN-NEXT: v_readlane_b32 s46, v39, 7 +; GCN-NEXT: v_readlane_b32 s45, v39, 6 +; GCN-NEXT: v_readlane_b32 s44, v39, 5 +; GCN-NEXT: v_readlane_b32 s43, v39, 4 +; GCN-NEXT: v_readlane_b32 s42, v39, 3 +; GCN-NEXT: v_readlane_b32 s41, v39, 2 +; GCN-NEXT: v_readlane_b32 s40, v39, 1 +; GCN-NEXT: v_readlane_b32 s39, v39, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s34, v0 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 +; GCN-NEXT: s_add_i32 s5, s33, 0x42100 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s5 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_mov_b32 s33, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, ptr addrspace(5) %local_val, align 128 diff --git a/llvm/test/CodeGen/Hexagon/iss127296.ll b/llvm/test/CodeGen/Hexagon/iss127296.ll new file mode 100644 index 0000000000000..bf0e7a9881014 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/iss127296.ll @@ -0,0 +1,18 @@ +; RUN: llc -mtriple=hexagon -O0 < %s | FileCheck %s + +; CHECK: r0 = add(r0,#-1) + +define fastcc void @os.linux.tls.initStatic(i32 %x) { + %1 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %x, i32 1) + br label %2 + + 2: ; preds = %0 + %3 = extractvalue { i32, i1 } %1, 0 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + diff --git a/llvm/test/CodeGen/Mips/readcyclecounter.ll b/llvm/test/CodeGen/Mips/readcyclecounter.ll new file mode 100644 index 0000000000000..23d3ea014f091 --- /dev/null +++ b/llvm/test/CodeGen/Mips/readcyclecounter.ll @@ -0,0 +1,47 @@ +;RUN: llc -mtriple=mipsel-linux-gnu -mcpu=mips32r2 < %s | FileCheck %s --check-prefix=MIPSEL +;RUN: llc -mtriple=mips64el-linux-gnuabi64 -mcpu=mips64r2 < %s | FileCheck %s --check-prefix=MIPS64EL +;RUN: llc -mtriple=mipsel-linux-gnu -mcpu=mips2 < %s | FileCheck %s --check-prefix=MIPSEL +;RUN: llc -mtriple=mips64el-linux-gnuabi64 -mcpu=mips3 < %s | FileCheck %s --check-prefix=MIPS64EL +;RUN: llc -mtriple=mipsel -mcpu=mips32r2 < %s | FileCheck %s --check-prefix=MIPSEL +;RUN: llc -mtriple=mips64el -mcpu=mips64r2 < %s | FileCheck %s --check-prefix=MIPS64EL +;RUN: llc -mtriple=mipsel -mcpu=mips2 < %s | FileCheck %s --check-prefix=MIPSEL_NOT_SUPPORTED +;RUN: llc -mtriple=mips64el -mcpu=mips3 < %s | FileCheck %s --check-prefix=MIPS64EL_NOT_SUPPORTED + +; XFAIL: expensive_checks + +declare i64 @llvm.readcyclecounter() nounwind readnone + +define i64 
@test_readcyclecounter() nounwind { +; MIPSEL-LABEL: test_readcyclecounter: +; MIPSEL: # %bb.0: # %entry +; MIPSEL-NEXT: .set push +; MIPSEL-NEXT: .set mips32r2 +; MIPSEL-NEXT: rdhwr $2, $hwr_cc +; MIPSEL-NEXT: .set pop +; MIPSEL-NEXT: jr $ra +; MIPSEL-NEXT: addiu $3, $zero, 0 +; +; MIPSEL_NOT_SUPPORTED-LABEL: test_readcyclecounter: +; MIPSEL_NOT_SUPPORTED: # %bb.0: # %entry +; MIPSEL_NOT_SUPPORTED-NEXT: addiu $2, $zero, 0 +; MIPSEL_NOT_SUPPORTED-NEXT: jr $ra +; MIPSEL_NOT_SUPPORTED-NEXT: addiu $3, $zero, 0 +; +; MIPS64EL-LABEL: test_readcyclecounter: +; MIPS64EL: # %bb.0: # %entry +; MIPS64EL-NEXT: .set push +; MIPS64EL-NEXT: .set mips32r2 +; MIPS64EL-NEXT: rdhwr $2, $hwr_cc +; MIPS64EL-NEXT: .set pop +; MIPS64EL-NEXT: jr $ra +; MIPS64EL-NEXT: nop +; +; MIPS64EL_NOT_SUPPORTED-LABEL: test_readcyclecounter: +; MIPS64EL_NOT_SUPPORTED: # %bb.0: # %entry +; MIPS64EL_NOT_SUPPORTED-NEXT: jr $ra +; MIPS64EL_NOT_SUPPORTED-NEXT: daddiu $2, $zero, 0 +entry: + %tmp0 = tail call i64 @llvm.readcyclecounter() + ret i64 %tmp0 +} + diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll index eb7be14abe431..0d1d75c1b2a75 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll @@ -894,18 +894,18 @@ define <2 x i16> @vwmul_v2i16_multiuse(ptr %x, ptr %y, ptr %z, ptr %w) { ; CHECK-LABEL: vwmul_v2i16_multiuse: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vle8.v v10, (a2) -; CHECK-NEXT: vle8.v v11, (a3) -; CHECK-NEXT: vsext.vf2 v12, v8 +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vle8.v v9, (a2) +; CHECK-NEXT: vsext.vf2 v10, v8 ; CHECK-NEXT: vsext.vf2 v8, v9 -; CHECK-NEXT: vsext.vf2 v9, v10 -; CHECK-NEXT: vsext.vf2 v10, v11 -; CHECK-NEXT: vmul.vv v11, v12, v10 -; CHECK-NEXT: vmul.vv v10, v8, v10 -; CHECK-NEXT: vdivu.vv v8, v8, v9 -; CHECK-NEXT: vor.vv v9, v11, v10 +; CHECK-NEXT: vdivu.vv v8, v10, v8 +; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v11, (a3) +; CHECK-NEXT: vsext.vf2 v12, v9 +; CHECK-NEXT: vsext.vf2 v9, v11 +; CHECK-NEXT: vmul.vv v11, v12, v9 +; CHECK-NEXT: vmul.vv v9, v10, v9 +; CHECK-NEXT: vor.vv v9, v11, v9 ; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll index 1948675ae9cf0..c46334fe556eb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -1564,8 +1564,8 @@ define void @sink_splat_fdiv_scalable(ptr nocapture %a, float %x) { ; CHECK-NEXT: .LBB27_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vfdiv.vf v8, v8, fa0 +; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vs1r.v v8, (a5) ; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB27_3 @@ -1654,8 +1654,8 @@ define void @sink_splat_frdiv_scalable(ptr nocapture %a, float %x) { ; CHECK-NEXT: .LBB28_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl1re32.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vfrdiv.vf v8, v8, fa0 +; CHECK-NEXT: sub a6, a6, a3 ; CHECK-NEXT: vs1r.v v8, (a5) ; CHECK-NEXT: add a5, a5, a1 ; CHECK-NEXT: bnez a6, .LBB28_3 @@ -2504,8 +2504,8 @@ define void @sink_splat_udiv_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB42_3: # %vector.body ; 
CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vdivu.vx v8, v8, a1 +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vs2r.v v8, (a6) ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB42_3 @@ -2595,8 +2595,8 @@ define void @sink_splat_sdiv_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB43_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vdiv.vx v8, v8, a1 +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vs2r.v v8, (a6) ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB43_3 @@ -2686,8 +2686,8 @@ define void @sink_splat_urem_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB44_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vremu.vx v8, v8, a1 +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vs2r.v v8, (a6) ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB44_3 @@ -2777,8 +2777,8 @@ define void @sink_splat_srem_scalable(ptr nocapture %a, i32 signext %x) { ; CHECK-NEXT: .LBB45_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vl2re32.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vrem.vx v8, v8, a1 +; CHECK-NEXT: sub a7, a7, a3 ; CHECK-NEXT: vs2r.v v8, (a6) ; CHECK-NEXT: add a6, a6, a5 ; CHECK-NEXT: bnez a7, .LBB45_3 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll index 07750623dd44b..217a02d08dead 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll @@ -221,16 +221,16 @@ define @vfdiv_vv_nxv32bf16( %va, @vfdiv_vf_nxv32bf16( %va, bf ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: vmv.v.x v16, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v0, v8, v0 +; CHECK-NEXT: vfdiv.vv v24, v16, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; 
CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v16, v24, v16 +; CHECK-NEXT: vfdiv.vv v16, v0, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -573,16 +583,16 @@ define @vfdiv_vv_nxv32f16( %va, @vfdiv_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: sub sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v8, a0 +; ZVFHMIN-NEXT: vmv.v.x v16, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v0, v8, v0 +; ZVFHMIN-NEXT: vfdiv.vv v24, v16, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v12 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v16, v24, v16 +; ZVFHMIN-NEXT: vfdiv.vv v16, v0, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll index e671ba850415b..9aba6455f0fac 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll @@ -200,16 +200,16 @@ define @vfdiv_vv_nxv32bf16( %va, @vfdiv_vv_nxv32bf16( %va, @vfdiv_vf_nxv32bf16( %va, bfloat %b) { ; CHECK-LABEL: vfdiv_vf_nxv32bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 
0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v0, v8, v0 +; CHECK-NEXT: vfdiv.vv v16, v16, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0 +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v16, v24, v16 +; CHECK-NEXT: vfdiv.vv v24, v24, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: ret %head = insertelement poison, bfloat %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer @@ -528,16 +512,16 @@ define @vfdiv_vv_nxv32f16( %va, @vfdiv_vf_nxv32f16( %va, half %b ; ; ZVFHMIN-LABEL: vfdiv_vf_nxv32f16: ; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: addi sp, sp, -16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: sub sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: addi a1, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v8, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v0, v8, v0 +; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v16, v24, v16 +; ZVFHMIN-NEXT: vfdiv.vv v24, v24, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 -; ZVFHMIN-NEXT: addi sp, sp, 16 -; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: ret %head = insertelement poison, half %b, i32 0 %splat = shufflevector %head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll index d5e65e2c8fd3f..eeb5f3bc984d3 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll @@ -92,15 +92,15 @@ define @vfsqrt_nxv32bf16( %v) stric ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsqrt.v v16, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfsqrt.v v16, v24 +; CHECK-NEXT: vfsqrt.v v24, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.sqrt.nxv32bf16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -229,15 +229,15 @@ define @vfsqrt_nxv32f16( %v) strictfp { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v16, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsqrt.v v16, v24 +; ZVFHMIN-NEXT: vfsqrt.v v24, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: ret %r = call @llvm.experimental.constrained.sqrt.nxv32f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll index 4d761981aac97..6d7662db2b157 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll @@ -87,15 +87,15 @@ define @vfsqrt_nxv32bf16( %v) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfsqrt.v v16, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfsqrt.v v16, v24 +; CHECK-NEXT: vfsqrt.v v24, v24 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24 ; CHECK-NEXT: ret %r = call @llvm.sqrt.nxv32bf16( %v) ret %r @@ -224,15 +224,15 @@ define @vfsqrt_nxv32f16( %v) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v16, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfsqrt.v v16, v24 +; ZVFHMIN-NEXT: vfsqrt.v v24, v24 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24 ; ZVFHMIN-NEXT: 
ret %r = call @llvm.sqrt.nxv32f16( %v) ret %r diff --git a/llvm/test/CodeGen/SPIRV/builtin_vars-decorate.ll b/llvm/test/CodeGen/SPIRV/builtin_vars-decorate.ll index 59abd5dbee6a0..0c9b29de890d4 100644 --- a/llvm/test/CodeGen/SPIRV/builtin_vars-decorate.ll +++ b/llvm/test/CodeGen/SPIRV/builtin_vars-decorate.ll @@ -1,22 +1,23 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s - -; CHECK: OpName %[[#WD:]] "__spirv_BuiltInWorkDim" -; CHECK: OpName %[[#GS:]] "__spirv_BuiltInGlobalSize" -; CHECK: OpName %[[#GII:]] "__spirv_BuiltInGlobalInvocationId" -; CHECK: OpName %[[#WS:]] "__spirv_BuiltInWorkgroupSize" -; CHECK: OpName %[[#EWS:]] "__spirv_BuiltInEnqueuedWorkgroupSize" -; CHECK: OpName %[[#LLI:]] "__spirv_BuiltInLocalInvocationId" -; CHECK: OpName %[[#NW:]] "__spirv_BuiltInNumWorkgroups" -; CHECK: OpName %[[#WI:]] "__spirv_BuiltInWorkgroupId" -; CHECK: OpName %[[#GO:]] "__spirv_BuiltInGlobalOffset" -; CHECK: OpName %[[#GLI:]] "__spirv_BuiltInGlobalLinearId" -; CHECK: OpName %[[#LLII:]] "__spirv_BuiltInLocalInvocationIndex" -; CHECK: OpName %[[#SS:]] "__spirv_BuiltInSubgroupSize" -; CHECK: OpName %[[#SMS:]] "__spirv_BuiltInSubgroupMaxSize" -; CHECK: OpName %[[#NS:]] "__spirv_BuiltInNumSubgroups" -; CHECK: OpName %[[#NES:]] "__spirv_BuiltInNumEnqueuedSubgroups" -; CHECK: OpName %[[#SI:]] "__spirv_BuiltInSubgroupId" -; CHECK: OpName %[[#SLII:]] "__spirv_BuiltInSubgroupLocalInvocationId" +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: OpName %[[#WD:]] "__spirv_BuiltInWorkDim" +; CHECK-DAG: OpName %[[#GS:]] "__spirv_BuiltInGlobalSize" +; CHECK-DAG: OpName %[[#GII:]] "__spirv_BuiltInGlobalInvocationId" +; CHECK-DAG: OpName %[[#WS:]] "__spirv_BuiltInWorkgroupSize" +; CHECK-DAG: OpName %[[#EWS:]] "__spirv_BuiltInEnqueuedWorkgroupSize" +; CHECK-DAG: OpName %[[#LLI:]] "__spirv_BuiltInLocalInvocationId" +; CHECK-DAG: OpName %[[#NW:]] "__spirv_BuiltInNumWorkgroups" +; CHECK-DAG: OpName %[[#WI:]] "__spirv_BuiltInWorkgroupId" +; CHECK-DAG: OpName %[[#GO:]] "__spirv_BuiltInGlobalOffset" +; CHECK-DAG: OpName %[[#GLI:]] "__spirv_BuiltInGlobalLinearId" +; CHECK-DAG: OpName %[[#LLII:]] "__spirv_BuiltInLocalInvocationIndex" +; CHECK-DAG: OpName %[[#SS:]] "__spirv_BuiltInSubgroupSize" +; CHECK-DAG: OpName %[[#SMS:]] "__spirv_BuiltInSubgroupMaxSize" +; CHECK-DAG: OpName %[[#NS:]] "__spirv_BuiltInNumSubgroups" +; CHECK-DAG: OpName %[[#NES:]] "__spirv_BuiltInNumEnqueuedSubgroups" +; CHECK-DAG: OpName %[[#SI:]] "__spirv_BuiltInSubgroupId" +; CHECK-DAG: OpName %[[#SLII:]] "__spirv_BuiltInSubgroupLocalInvocationId" ; CHECK-DAG: OpDecorate %[[#NW]] BuiltIn NumWorkgroups ; CHECK-DAG: OpDecorate %[[#WS]] BuiltIn WorkgroupSize @@ -35,6 +36,33 @@ ; CHECK-DAG: OpDecorate %[[#NES]] BuiltIn NumEnqueuedSubgroups ; CHECK-DAG: OpDecorate %[[#SI]] BuiltIn SubgroupId ; CHECK-DAG: OpDecorate %[[#SLII]] BuiltIn SubgroupLocalInvocationId + +; CHECK-DAG: %[[#SizeT:]] = OpTypeInt 64 0 +; CHECK-DAG: %[[#Int32:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#SizeTPtr:]] = OpTypePointer Input %[[#SizeT]] +; CHECK-DAG: %[[#Int32Ptr:]] = OpTypePointer Input %[[#Int32]] + +; CHECK-DAG: %[[#GLI]] = OpVariable %[[#SizeTPtr]] Input +; CHECK-DAG: %[[#LLII]] = OpVariable %[[#SizeTPtr]] Input +; CHECK-DAG: %[[#WD]] = OpVariable %[[#Int32Ptr]] Input +; CHECK-DAG: %[[#SS]] = OpVariable %[[#Int32Ptr]] Input +; CHECK-DAG: %[[#SMS]] = OpVariable %[[#Int32Ptr]] Input +; 
CHECK-DAG: %[[#NS]] = OpVariable %[[#Int32Ptr]] Input +; CHECK-DAG: %[[#NES]] = OpVariable %[[#Int32Ptr]] Input +; CHECK-DAG: %[[#SI]] = OpVariable %[[#Int32Ptr]] Input +; CHECK-DAG: %[[#SLII]] = OpVariable %[[#Int32Ptr]] Input + +; CHECK: OpFunction +; CHECK: %[[#]] = OpLoad %[[#SizeT]] %[[#GLI]] +; CHECK: %[[#]] = OpLoad %[[#SizeT]] %[[#LLII]] +; CHECK: %[[#]] = OpLoad %[[#Int32]] %[[#WD]] +; CHECK: %[[#]] = OpLoad %[[#Int32]] %[[#SS]] +; CHECK: %[[#]] = OpLoad %[[#Int32]] %[[#SMS]] +; CHECK: %[[#]] = OpLoad %[[#Int32]] %[[#NS]] +; CHECK: %[[#]] = OpLoad %[[#Int32]] %[[#NES]] +; CHECK: %[[#]] = OpLoad %[[#Int32]] %[[#SI]] +; CHECK: %[[#]] = OpLoad %[[#Int32]] %[[#SLII]] + @__spirv_BuiltInWorkDim = external addrspace(1) global i32 @__spirv_BuiltInGlobalSize = external addrspace(1) global <3 x i32> @__spirv_BuiltInGlobalInvocationId = external addrspace(1) global <3 x i32> @@ -55,5 +83,24 @@ define spir_kernel void @_Z1wv() { entry: + %r1 = tail call spir_func i64 @get_global_linear_id() + %r2 = tail call spir_func i64 @get_local_linear_id() + %r3 = tail call spir_func i32 @get_work_dim() + %r4 = tail call spir_func i32 @get_sub_group_size() + %r5 = tail call spir_func i32 @get_max_sub_group_size() + %r6 = tail call spir_func i32 @get_num_sub_groups() + %r7 = tail call spir_func i32 @get_enqueued_num_sub_groups() + %r8 = tail call spir_func i32 @get_sub_group_id() + %r9 = tail call spir_func i32 @get_sub_group_local_id() ret void } + +declare spir_func i64 @get_global_linear_id() +declare spir_func i64 @get_local_linear_id() +declare spir_func i32 @get_work_dim() +declare spir_func i32 @get_sub_group_size() +declare spir_func i32 @get_max_sub_group_size() +declare spir_func i32 @get_num_sub_groups() +declare spir_func i32 @get_enqueued_num_sub_groups() +declare spir_func i32 @get_sub_group_id() +declare spir_func i32 @get_sub_group_local_id() diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_shader_atomic_float_add/atomicrmw_faddfsub_float.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_shader_atomic_float_add/atomicrmw_faddfsub_float.ll index 075e63ea6de61..c6c8afc47dee3 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_shader_atomic_float_add/atomicrmw_faddfsub_float.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_shader_atomic_float_add/atomicrmw_faddfsub_float.ll @@ -1,6 +1,10 @@ ; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_EXT_shader_atomic_float_add %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_EXT_shader_atomic_float_add %s -o - -filetype=obj | spirv-val %} + +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_shader_atomic_float_add %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_shader_atomic_float_add %s -o - -filetype=obj | spirv-val %} ; CHECK-ERROR: LLVM ERROR: The atomic float instruction requires the following SPIR-V extension: SPV_EXT_shader_atomic_float_add @@ -25,9 +29,6 @@ ; CHECK: %[[Neg42:[0-9]+]] = OpFNegate %[[TyFP32]] %[[Const42]] ; CHECK: OpAtomicFAddEXT %[[TyFP32]] %[[DblPtr]] %[[ScopeWorkgroup]] %[[WorkgroupMemory]] %[[Neg42]] -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64" - @f = common dso_local local_unnamed_addr addrspace(1) global float 
0.000000e+00, align 8 define dso_local spir_func void @test1() local_unnamed_addr { @@ -55,5 +56,31 @@ entry: declare spir_func float @_Z25atomic_fetch_add_explicitPU3AS1VU7_Atomicff12memory_order(ptr addrspace(1), float, i32) declare spir_func float @_Z25atomic_fetch_sub_explicitPU3AS1VU7_Atomicff12memory_order(ptr addrspace(1), float, i32) +; CHECK: %[[#Ptr1:]] = OpConvertUToPtr %[[TyFP32Ptr]] %[[#]] +; CHECK: %[[#]] = OpAtomicFAddEXT %[[TyFP32]] %[[#Ptr1]] %[[#]] %[[#]] %[[#]] +; CHECK: %[[#Ptr2:]] = OpConvertUToPtr %[[TyFP32Ptr]] %[[#]] +; CHECK: %[[#]] = OpAtomicFAddEXT %[[TyFP32]] %[[#Ptr2]] %[[#]] %[[#]] %[[#]] +; CHECK: %[[#Ptr3:]] = OpConvertUToPtr %[[TyFP32Ptr]] %[[#]] +; CHECK: %[[#]] = OpAtomicFAddEXT %[[TyFP32]] %[[#Ptr3]] %[[#]] %[[#]] %[[#]] +; CHECK: %[[#Ptr4:]] = OpConvertUToPtr %[[TyFP32Ptr]] %[[#]] +; CHECK: %[[#]] = OpAtomicFAddEXT %[[TyFP32]] %[[#Ptr4]] %[[#]] %[[#]] %[[#]] +; CHECK: %[[#Ptr5:]] = OpConvertUToPtr %[[TyFP32Ptr]] %[[#]] +; CHECK: %[[#]] = OpAtomicFAddEXT %[[TyFP32]] %[[#Ptr5]] %[[#]] %[[#]] %[[#]] + +define dso_local spir_func void @test4(i64 noundef %arg, float %val) local_unnamed_addr { +entry: + %ptr1 = inttoptr i64 %arg to float addrspace(1)* + %v1 = atomicrmw fadd ptr addrspace(1) %ptr1, float %val seq_cst, align 4 + %ptr2 = inttoptr i64 %arg to float addrspace(1)* + %v2 = atomicrmw fsub ptr addrspace(1) %ptr2, float %val seq_cst, align 4 + %ptr3 = inttoptr i64 %arg to float addrspace(1)* + %v3 = tail call spir_func float @_Z21__spirv_AtomicFAddEXT(ptr addrspace(1) %ptr3, i32 1, i32 16, float %val) + %ptr4 = inttoptr i64 %arg to float addrspace(1)* + %v4 = tail call spir_func float @_Z25atomic_fetch_add_explicitPU3AS1VU7_Atomicff12memory_order(ptr addrspace(1) %ptr4, float %val, i32 0) + %ptr5 = inttoptr i64 %arg to float addrspace(1)* + %v5 = tail call spir_func float @_Z25atomic_fetch_sub_explicitPU3AS1VU7_Atomicff12memory_order(ptr addrspace(1) %ptr5, float %val, i32 0) + ret void +} + !llvm.module.flags = !{!0} !0 = !{i32 1, !"wchar_size", i32 4} diff --git a/llvm/test/CodeGen/SPIRV/opencl/get_num_groups.ll b/llvm/test/CodeGen/SPIRV/opencl/get_num_groups.ll new file mode 100644 index 0000000000000..3f1d1dc248fc4 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/opencl/get_num_groups.ll @@ -0,0 +1,55 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s + +;; The set of valid inputs for get_num_groups depends on the runtime NDRange, +;; but inputs outside of [0, 2] always return 1. +;; Here we assume Itanium mangling for function name. 
+declare i64 @_Z14get_num_groupsj(i32) + +define i64 @foo(i32 %dim) { + %x = call i64 @_Z14get_num_groupsj(i32 0) + %y = call i64 @_Z14get_num_groupsj(i32 5) + %acc = add i64 %x, %y + %unknown = call i64 @_Z14get_num_groupsj(i32 %dim) + %ret = add i64 %acc, %unknown + ret i64 %ret +} + +;; Capabilities: +; CHECK-DAG: OpCapability Kernel +; CHECK-DAG: OpCapability Int64 + +; CHECK-NOT: DAG-FENCE + +;; Decorations: +; CHECK-DAG: OpDecorate %[[#GET_NUM_GROUPS:]] BuiltIn NumWorkgroups +; CHECK-DAG: OpDecorate %[[#GET_NUM_GROUPS]] Constant + +; CHECK-NOT: DAG-FENCE + +;; Types, Constants and Variables: +; CHECK-DAG: %[[#BOOL:]] = OpTypeBool +; CHECK-DAG: %[[#I32:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#I64:]] = OpTypeInt 64 0 +; CHECK-DAG: %[[#VEC:]] = OpTypeVector %[[#I64]] 3 +; CHECK-DAG: %[[#PTR:]] = OpTypePointer Input %[[#VEC]] +; CHECK-DAG: %[[#FN:]] = OpTypeFunction %[[#I64]] %[[#I32]] +; CHECK-DAG: %[[#GET_NUM_GROUPS]] = OpVariable %[[#PTR]] Input +; CHECK-DAG: %[[#ONE:]] = OpConstant %[[#I64]] 1 +; CHECK-DAG: %[[#THREE:]] = OpConstant %[[#I32]] 3 + +;; Functions: +; CHECK: OpFunction %[[#I64]] None %[[#FN]] +; CHECK: %[[#DIM:]] = OpFunctionParameter %[[#I32]] + +;; get_num_groups(0): OpLoad + OpCompositeExtract. +; CHECK: %[[#TMP1:]] = OpLoad %[[#VEC]] %[[#GET_NUM_GROUPS]] +; CHECK: %[[#X:]] = OpCompositeExtract %[[#I64]] %[[#TMP1]] 0 + +;; get_num_groups(5): OpConstant of one. +; CHECK: OpIAdd %[[#I64]] %[[#X]] %[[#ONE]] + +;; get_num_groups(dim): Implementation using OpSelect. +; CHECK-DAG: %[[#TMP2:]] = OpLoad %[[#VEC]] %[[#GET_NUM_GROUPS]] +; CHECK-DAG: %[[#TMP3:]] = OpVectorExtractDynamic %[[#I64]] %[[#TMP2]] %[[#DIM]] +; CHECK-DAG: %[[#COND:]] = OpULessThan %[[#BOOL]] %[[#DIM]] %[[#THREE]] +; CHECK: %[[#UNKNOWN:]] = OpSelect %[[#I64]] %[[#COND]] %[[#TMP3]] %[[#ONE]] diff --git a/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll b/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll index 3e5a3ac356936..17a915e33c973 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll @@ -1,6 +1,9 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + ;; Check 'LLVM ==> SPIR-V' conversion of atomic_load and atomic_store. 
; CHECK-SPIRV-LABEL: OpFunction @@ -17,17 +20,50 @@ entry: ; CHECK-SPIRV-LABEL: OpFunction ; CHECK-SPIRV-NEXT: %[[#object:]] = OpFunctionParameter %[[#]] -; CHECK-SPIRV-NEXT: OpFunctionParameter ; CHECK-SPIRV-NEXT: %[[#desired:]] = OpFunctionParameter %[[#]] ; CHECK-SPIRV: OpAtomicStore %[[#object]] %[[#]] %[[#]] %[[#desired]] ; CHECK-SPIRV-LABEL: OpFunctionEnd -define spir_func void @test_store(i32 addrspace(4)* %object, i32 addrspace(4)* %expected, i32 %desired) { +define spir_func void @test_store(i32 addrspace(4)* %object, i32 %desired) { entry: call spir_func void @_Z12atomic_storePVU3AS4U7_Atomicii(i32 addrspace(4)* %object, i32 %desired) ret void } declare spir_func i32 @_Z11atomic_loadPVU3AS4U7_Atomici(i32 addrspace(4)*) - declare spir_func void @_Z12atomic_storePVU3AS4U7_Atomicii(i32 addrspace(4)*, i32) + +; The goal of @test_typesX() cases is to ensure that a correct pointer type +; is deduced from the Value argument of OpAtomicLoad/OpAtomicStore. There is +; no need to add more pattern matching rules to be sure that the pointer type +; is valid, it's enough that `spirv-val` considers the output valid as it +; checks the same condition while validating the output. + +define spir_func void @test_types1(ptr addrspace(1) %ptr, float %val) { +entry: + %r = call spir_func float @atomic_load(ptr addrspace(1) %ptr) + ret void +} + +define spir_func void @test_types2(ptr addrspace(1) %ptr, float %val) { +entry: + call spir_func void @atomic_store(ptr addrspace(1) %ptr, float %val) + ret void +} + +define spir_func void @test_types3(i64 noundef %arg, float %val) { +entry: + %ptr1 = inttoptr i64 %arg to float addrspace(1)* + %r = call spir_func float @atomic_load(ptr addrspace(1) %ptr1) + ret void +} + +define spir_func void @test_types4(i64 noundef %arg, float %val) { +entry: + %ptr2 = inttoptr i64 %arg to float addrspace(1)* + call spir_func void @atomic_store(ptr addrspace(1) %ptr2, float %val) + ret void +} + +declare spir_func float @atomic_load(ptr addrspace(1)) +declare spir_func void @atomic_store(ptr addrspace(1), float) diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll index 5074893163565..44d2f5e24f59d 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll @@ -1,8 +1,11 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} -; CHECK-SPIRV: OpDecorate %[[#Id:]] BuiltIn GlobalLinearId -; CHECK-SPIRV: %[[#Id:]] = OpVariable %[[#]] +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpDecorate %[[#Id:]] BuiltIn GlobalLinearId +; CHECK: %[[#Id:]] = OpVariable %[[#]] @__spirv_BuiltInGlobalLinearId = external addrspace(1) global i32 diff --git a/llvm/test/CodeGen/SPIRV/transcoding/spirv-types.ll b/llvm/test/CodeGen/SPIRV/transcoding/spirv-types.ll index 8d99a0c6cd1ce..36ae6bf478127 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/spirv-types.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/spirv-types.ll @@ -85,5 +85,61 @@ define spir_func void @test_sampler(target("spirv.Image", float, 1, 1, 0, 0, 0, } declare spir_func target("spirv.Image", float, 1, 1, 0, 
0, 0, 0, 0) @_Z20__spirv_SampledImagePU3AS1K34__spirv_Image__float_1_1_0_0_0_0_0PU3AS1K15__spirv_Sampler(target("spirv.Image", float, 1, 1, 0, 0, 0, 0, 0), target("spirv.Sampler")) - declare spir_func <4 x float> @_Z38__spirv_ImageSampleExplicitLod_Rfloat4PU3AS120__spirv_SampledImageDv4_iif(target("spirv.Image", float, 1, 1, 0, 0, 0, 0, 0), <4 x i32>, i32, float) + +; CHECK-SPIRV: %[[#]] = OpImageRead +; CHECK-SPIRV: %[[#]] = OpImageRead +; CHECK-SPIRV: %[[#]] = OpImageRead +; CHECK-SPIRV: %[[#]] = OpImageRead +; CHECK-SPIRV: %[[#]] = OpImageRead +; CHECK-SPIRV: %[[#]] = OpImageRead +; CHECK-SPIRV: %[[#]] = OpImageRead +; CHECK-SPIRV: %[[#]] = OpImageSampleExplicitLod + +define dso_local spir_kernel void @reads() { + %1 = tail call spir_func i32 @_Z17__spirv_ImageReadIi14ocl_image3d_roDv4_iET_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) poison, <4 x i32> zeroinitializer) + %2 = tail call spir_func <2 x i32> @_Z17__spirv_ImageReadIDv2_i14ocl_image2d_roS0_ET_T0_T1_(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) poison, <2 x i32> zeroinitializer) + %3 = tail call spir_func <4 x i32> @_Z17__spirv_ImageReadIDv4_j14ocl_image3d_roDv4_iET_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) poison, <4 x i32> zeroinitializer) + %4 = tail call spir_func signext i16 @_Z17__spirv_ImageReadIs14ocl_image1d_roiET_T0_T1_(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0) poison, i32 0) + %5 = tail call spir_func zeroext i16 @_Z17__spirv_ImageReadIt14ocl_image3d_roDv4_iET_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0) poison, <4 x i32> zeroinitializer) + %6 = tail call spir_func <2 x float> @_Z17__spirv_ImageReadIDv2_f14ocl_image1d_roiET_T0_T1_(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0) poison, i32 0) + %7 = tail call spir_func half @_Z17__spirv_ImageReadIDF16_14ocl_image2d_roDv2_iET_T0_T1_(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) poison, <2 x i32> zeroinitializer) + %8 = tail call spir_func <4 x i32> @_Z30__spirv_ImageSampleExplicitLodI32__spirv_SampledImage__image1d_roDv4_jfET0_T_T1_if(target("spirv.SampledImage", void, 0, 0, 0, 0, 0, 0, 0) poison, float 0.000000e+00, i32 2, float 0.000000e+00) + ret void +} + +declare dso_local spir_func i32 @_Z17__spirv_ImageReadIi14ocl_image3d_roDv4_iET_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0), <4 x i32>) +declare dso_local spir_func <2 x i32> @_Z17__spirv_ImageReadIDv2_i14ocl_image2d_roS0_ET_T0_T1_(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), <2 x i32>) +declare dso_local spir_func <4 x i32> @_Z17__spirv_ImageReadIDv4_j14ocl_image3d_roDv4_iET_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0), <4 x i32>) +declare dso_local spir_func signext i16 @_Z17__spirv_ImageReadIs14ocl_image1d_roiET_T0_T1_(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0), i32) +declare dso_local spir_func zeroext i16 @_Z17__spirv_ImageReadIt14ocl_image3d_roDv4_iET_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 0), <4 x i32>) +declare dso_local spir_func <2 x float> @_Z17__spirv_ImageReadIDv2_f14ocl_image1d_roiET_T0_T1_(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0), i32) +declare dso_local spir_func half @_Z17__spirv_ImageReadIDF16_14ocl_image2d_roDv2_iET_T0_T1_(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), <2 x i32>) +declare dso_local spir_func <4 x i32> @_Z30__spirv_ImageSampleExplicitLodI32__spirv_SampledImage__image1d_roDv4_jfET0_T_T1_if(target("spirv.SampledImage", void, 0, 0, 0, 0, 0, 0, 0), float noundef, i32 noundef, float noundef) + +; CHECK-SPIRV: OpImageWrite +; CHECK-SPIRV: OpImageWrite +; CHECK-SPIRV: 
OpImageWrite +; CHECK-SPIRV: OpImageWrite +; CHECK-SPIRV: OpImageWrite +; CHECK-SPIRV: OpImageWrite +; CHECK-SPIRV: OpImageWrite + +define dso_local spir_kernel void @writes() { + call spir_func void @_Z18__spirv_ImageWriteI14ocl_image3d_woDv4_iiEvT_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1) poison, <4 x i32> zeroinitializer, i32 zeroinitializer) + call spir_func void @_Z18__spirv_ImageWriteI14ocl_image2d_woDv2_iS1_EvT_T0_T1_(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) poison, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer) + call spir_func void @_Z18__spirv_ImageWriteI14ocl_image3d_woDv4_iDv4_jEvT_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1) poison, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer) + call spir_func void @_Z18__spirv_ImageWriteI14ocl_image1d_woisEvT_T0_T1_(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1) poison, i32 0, i16 signext 0) + call spir_func void @_Z18__spirv_ImageWriteI14ocl_image3d_woDv4_itEvT_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1) poison, <4 x i32> zeroinitializer, i16 zeroext 0) + call spir_func void @_Z18__spirv_ImageWriteI14ocl_image1d_woiDv2_fEvT_T0_T1_(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1) poison, i32 0, <2 x float> zeroinitializer) + call spir_func void @_Z18__spirv_ImageWriteI14ocl_image2d_woDv2_iDF16_EvT_T0_T1_(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1) poison, <2 x i32> zeroinitializer, half zeroinitializer) + ret void +} + +declare dso_local spir_func void @_Z18__spirv_ImageWriteI14ocl_image3d_woDv4_iiEvT_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1), <4 x i32>, i32) +declare dso_local spir_func void @_Z18__spirv_ImageWriteI14ocl_image2d_woDv2_iS1_EvT_T0_T1_(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, <2 x i32>) +declare dso_local spir_func void @_Z18__spirv_ImageWriteI14ocl_image3d_woDv4_iDv4_jEvT_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1), <4 x i32>, <4 x i32>) +declare dso_local spir_func void @_Z18__spirv_ImageWriteI14ocl_image1d_woisEvT_T0_T1_(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1), i32, i16 signext) +declare dso_local spir_func void @_Z18__spirv_ImageWriteI14ocl_image3d_woDv4_itEvT_T0_T1_(target("spirv.Image", void, 2, 0, 0, 0, 0, 0, 1), <4 x i32>, i16 zeroext) +declare dso_local spir_func void @_Z18__spirv_ImageWriteI14ocl_image1d_woiDv2_fEvT_T0_T1_(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1), i32, <2 x float>) +declare dso_local spir_func void @_Z18__spirv_ImageWriteI14ocl_image2d_woDv2_iDF16_EvT_T0_T1_(target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 1), <2 x i32>, half) diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 53517373d3e4d..e513b666ebf83 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -660,21 +660,25 @@ define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) { ret <8 x half> %r } +; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set +; in the original IR. 
define double @fmul_pow_shl_cnt(i64 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: shlq $52, %rdi -; CHECK-SSE-NEXT: movabsq $4621256167635550208, %rax # imm = 0x4022000000000000 -; CHECK-SSE-NEXT: addq %rdi, %rax -; CHECK-SSE-NEXT: movq %rax, %xmm0 +; CHECK-SSE-NEXT: movzbl %dil, %eax +; CHECK-SSE-NEXT: shlq $52, %rax +; CHECK-SSE-NEXT: movabsq $4621256167635550208, %rcx # imm = 0x4022000000000000 +; CHECK-SSE-NEXT: addq %rax, %rcx +; CHECK-SSE-NEXT: movq %rcx, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: fmul_pow_shl_cnt: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: shlq $52, %rdi -; CHECK-AVX-NEXT: movabsq $4621256167635550208, %rax # imm = 0x4022000000000000 -; CHECK-AVX-NEXT: addq %rdi, %rax -; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: movzbl %dil, %eax +; CHECK-AVX-NEXT: shlq $52, %rax +; CHECK-AVX-NEXT: movabsq $4621256167635550208, %rcx # imm = 0x4022000000000000 +; CHECK-AVX-NEXT: addq %rax, %rcx +; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 ; CHECK-AVX-NEXT: retq %shl = shl nuw i64 1, %cnt %conv = uitofp i64 %shl to double @@ -682,23 +686,27 @@ define double @fmul_pow_shl_cnt(i64 %cnt) nounwind { ret double %mul } +; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set +; in the original IR. define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt2: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: incl %edi -; CHECK-SSE-NEXT: shlq $52, %rdi -; CHECK-SSE-NEXT: movabsq $-4602115869219225600, %rax # imm = 0xC022000000000000 -; CHECK-SSE-NEXT: addq %rdi, %rax -; CHECK-SSE-NEXT: movq %rax, %xmm0 +; CHECK-SSE-NEXT: movzbl %dil, %eax +; CHECK-SSE-NEXT: incl %eax +; CHECK-SSE-NEXT: shlq $52, %rax +; CHECK-SSE-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000 +; CHECK-SSE-NEXT: addq %rax, %rcx +; CHECK-SSE-NEXT: movq %rcx, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: fmul_pow_shl_cnt2: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: incl %edi -; CHECK-AVX-NEXT: shlq $52, %rdi -; CHECK-AVX-NEXT: movabsq $-4602115869219225600, %rax # imm = 0xC022000000000000 -; CHECK-AVX-NEXT: addq %rdi, %rax -; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: movzbl %dil, %eax +; CHECK-AVX-NEXT: incl %eax +; CHECK-AVX-NEXT: shlq $52, %rax +; CHECK-AVX-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000 +; CHECK-AVX-NEXT: addq %rax, %rcx +; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 ; CHECK-AVX-NEXT: retq %shl = shl nuw i64 2, %cnt %conv = uitofp i64 %shl to double @@ -706,27 +714,55 @@ define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind { ret double %mul } +; Make sure we do a movzbl of the input register. +define double @fmul_pow_shl_cnt3(i8 %cnt) nounwind { +; CHECK-SSE-LABEL: fmul_pow_shl_cnt3: +; CHECK-SSE: # %bb.0: +; CHECK-SSE-NEXT: movzbl %dil, %eax +; CHECK-SSE-NEXT: shlq $52, %rax +; CHECK-SSE-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000 +; CHECK-SSE-NEXT: addq %rax, %rcx +; CHECK-SSE-NEXT: movq %rcx, %xmm0 +; CHECK-SSE-NEXT: retq +; +; CHECK-AVX-LABEL: fmul_pow_shl_cnt3: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: movzbl %dil, %eax +; CHECK-AVX-NEXT: shlq $52, %rax +; CHECK-AVX-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000 +; CHECK-AVX-NEXT: addq %rax, %rcx +; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 +; CHECK-AVX-NEXT: retq + %zext_cnt = zext i8 %cnt to i64 + %shl = shl nuw i64 1, %zext_cnt + %conv = uitofp i64 %shl to double + %mul = fmul double -9.000000e+00, %conv + ret double %mul +} + +; FIXME: The movzbl is unnecessary. 
It would be UB for the upper bits to be set +; in the original IR. define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind { ; CHECK-SSE-LABEL: fmul_pow_select: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-SSE-NEXT: leal 1(%rdi), %eax +; CHECK-SSE-NEXT: movzbl %dil, %eax +; CHECK-SSE-NEXT: leal 1(%rax), %ecx ; CHECK-SSE-NEXT: testb $1, %sil -; CHECK-SSE-NEXT: cmovnel %edi, %eax -; CHECK-SSE-NEXT: shll $23, %eax -; CHECK-SSE-NEXT: addl $1091567616, %eax # imm = 0x41100000 -; CHECK-SSE-NEXT: movd %eax, %xmm0 +; CHECK-SSE-NEXT: cmovnel %eax, %ecx +; CHECK-SSE-NEXT: shll $23, %ecx +; CHECK-SSE-NEXT: addl $1091567616, %ecx # imm = 0x41100000 +; CHECK-SSE-NEXT: movd %ecx, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: fmul_pow_select: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-AVX-NEXT: leal 1(%rdi), %eax +; CHECK-AVX-NEXT: movzbl %dil, %eax +; CHECK-AVX-NEXT: leal 1(%rax), %ecx ; CHECK-AVX-NEXT: testb $1, %sil -; CHECK-AVX-NEXT: cmovnel %edi, %eax -; CHECK-AVX-NEXT: shll $23, %eax -; CHECK-AVX-NEXT: addl $1091567616, %eax # imm = 0x41100000 -; CHECK-AVX-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-NEXT: cmovnel %eax, %ecx +; CHECK-AVX-NEXT: shll $23, %ecx +; CHECK-AVX-NEXT: addl $1091567616, %ecx # imm = 0x41100000 +; CHECK-AVX-NEXT: vmovd %ecx, %xmm0 ; CHECK-AVX-NEXT: retq %shl2 = shl nuw i32 2, %cnt %shl1 = shl nuw i32 1, %cnt @@ -736,27 +772,31 @@ define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind { ret float %mul } +; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set +; in the original IR. define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_fly_pow_mul_min_pow2: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: addl $3, %edi -; CHECK-SSE-NEXT: cmpl $13, %edi -; CHECK-SSE-NEXT: movl $13, %eax -; CHECK-SSE-NEXT: cmovbl %edi, %eax -; CHECK-SSE-NEXT: shll $23, %eax -; CHECK-SSE-NEXT: addl $1091567616, %eax # imm = 0x41100000 -; CHECK-SSE-NEXT: movd %eax, %xmm0 +; CHECK-SSE-NEXT: movzbl %dil, %eax +; CHECK-SSE-NEXT: addl $3, %eax +; CHECK-SSE-NEXT: cmpl $13, %eax +; CHECK-SSE-NEXT: movl $13, %ecx +; CHECK-SSE-NEXT: cmovbl %eax, %ecx +; CHECK-SSE-NEXT: shll $23, %ecx +; CHECK-SSE-NEXT: addl $1091567616, %ecx # imm = 0x41100000 +; CHECK-SSE-NEXT: movd %ecx, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: fmul_fly_pow_mul_min_pow2: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: addl $3, %edi -; CHECK-AVX-NEXT: cmpl $13, %edi -; CHECK-AVX-NEXT: movl $13, %eax -; CHECK-AVX-NEXT: cmovbl %edi, %eax -; CHECK-AVX-NEXT: shll $23, %eax -; CHECK-AVX-NEXT: addl $1091567616, %eax # imm = 0x41100000 -; CHECK-AVX-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-NEXT: movzbl %dil, %eax +; CHECK-AVX-NEXT: addl $3, %eax +; CHECK-AVX-NEXT: cmpl $13, %eax +; CHECK-AVX-NEXT: movl $13, %ecx +; CHECK-AVX-NEXT: cmovbl %eax, %ecx +; CHECK-AVX-NEXT: shll $23, %ecx +; CHECK-AVX-NEXT: addl $1091567616, %ecx # imm = 0x41100000 +; CHECK-AVX-NEXT: vmovd %ecx, %xmm0 ; CHECK-AVX-NEXT: retq %shl8 = shl nuw i64 8, %cnt %shl = call i64 @llvm.umin.i64(i64 %shl8, i64 8192) @@ -765,28 +805,30 @@ define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind { ret float %mul } +; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set +; in the original IR. 
define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_mul_max_pow2: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movl %edi, %eax +; CHECK-SSE-NEXT: movzbl %dil, %eax ; CHECK-SSE-NEXT: leaq 1(%rax), %rcx ; CHECK-SSE-NEXT: cmpq %rcx, %rax ; CHECK-SSE-NEXT: cmovaq %rax, %rcx ; CHECK-SSE-NEXT: shlq $52, %rcx ; CHECK-SSE-NEXT: movabsq $4613937818241073152, %rax # imm = 0x4008000000000000 -; CHECK-SSE-NEXT: addq %rcx, %rax +; CHECK-SSE-NEXT: orq %rcx, %rax ; CHECK-SSE-NEXT: movq %rax, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: fmul_pow_mul_max_pow2: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: movl %edi, %eax +; CHECK-AVX-NEXT: movzbl %dil, %eax ; CHECK-AVX-NEXT: leaq 1(%rax), %rcx ; CHECK-AVX-NEXT: cmpq %rcx, %rax ; CHECK-AVX-NEXT: cmovaq %rax, %rcx ; CHECK-AVX-NEXT: shlq $52, %rcx ; CHECK-AVX-NEXT: movabsq $4613937818241073152, %rax # imm = 0x4008000000000000 -; CHECK-AVX-NEXT: addq %rcx, %rax +; CHECK-AVX-NEXT: orq %rcx, %rax ; CHECK-AVX-NEXT: vmovq %rax, %xmm0 ; CHECK-AVX-NEXT: retq %shl2 = shl nuw i16 2, %cnt @@ -1161,23 +1203,25 @@ define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind { ret double %mul } +; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set +; in the original IR. define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind { ; CHECK-SSE-LABEL: fmul_pow_shl_cnt_safe: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-SSE-NEXT: shlq $52, %rdi -; CHECK-SSE-NEXT: movabsq $8930638061065157010, %rax # imm = 0x7BEFFFFFFF5F3992 -; CHECK-SSE-NEXT: addq %rdi, %rax -; CHECK-SSE-NEXT: movq %rax, %xmm0 +; CHECK-SSE-NEXT: movzbl %dil, %eax +; CHECK-SSE-NEXT: shlq $52, %rax +; CHECK-SSE-NEXT: movabsq $8930638061065157010, %rcx # imm = 0x7BEFFFFFFF5F3992 +; CHECK-SSE-NEXT: addq %rax, %rcx +; CHECK-SSE-NEXT: movq %rcx, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: fmul_pow_shl_cnt_safe: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-AVX-NEXT: shlq $52, %rdi -; CHECK-AVX-NEXT: movabsq $8930638061065157010, %rax # imm = 0x7BEFFFFFFF5F3992 -; CHECK-AVX-NEXT: addq %rdi, %rax -; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: movzbl %dil, %eax +; CHECK-AVX-NEXT: shlq $52, %rax +; CHECK-AVX-NEXT: movabsq $8930638061065157010, %rcx # imm = 0x7BEFFFFFFF5F3992 +; CHECK-AVX-NEXT: addq %rax, %rcx +; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 ; CHECK-AVX-NEXT: retq %shl = shl nuw i16 1, %cnt %conv = uitofp i16 %shl to double @@ -1236,15 +1280,15 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind { ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-SSE-NEXT: shlq %cl, %rax ; CHECK-SSE-NEXT: testq %rax, %rax -; CHECK-SSE-NEXT: js .LBB22_1 +; CHECK-SSE-NEXT: js .LBB23_1 ; CHECK-SSE-NEXT: # %bb.2: ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 -; CHECK-SSE-NEXT: jmp .LBB22_3 -; CHECK-SSE-NEXT: .LBB22_1: +; CHECK-SSE-NEXT: jmp .LBB23_3 +; CHECK-SSE-NEXT: .LBB23_1: ; CHECK-SSE-NEXT: shrq %rax ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 ; CHECK-SSE-NEXT: addss %xmm1, %xmm1 -; CHECK-SSE-NEXT: .LBB22_3: +; CHECK-SSE-NEXT: .LBB23_3: ; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-SSE-NEXT: divss %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq @@ -1256,15 +1300,15 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind { ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-AVX2-NEXT: shlq %cl, %rax ; CHECK-AVX2-NEXT: testq %rax, %rax -; CHECK-AVX2-NEXT: js .LBB22_1 +; CHECK-AVX2-NEXT: js .LBB23_1 ; 
CHECK-AVX2-NEXT: # %bb.2: ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: jmp .LBB22_3 -; CHECK-AVX2-NEXT: .LBB22_1: +; CHECK-AVX2-NEXT: jmp .LBB23_3 +; CHECK-AVX2-NEXT: .LBB23_1: ; CHECK-AVX2-NEXT: shrq %rax ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: .LBB22_3: +; CHECK-AVX2-NEXT: .LBB23_3: ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1545,23 +1589,25 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind { ret half %mul } +; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set +; in the original IR. define double @fdiv_pow_shl_cnt32_to_dbl_okay(i32 %cnt) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-SSE-NEXT: shlq $52, %rdi -; CHECK-SSE-NEXT: movabsq $3936146074321813504, %rax # imm = 0x36A0000000000000 -; CHECK-SSE-NEXT: subq %rdi, %rax -; CHECK-SSE-NEXT: movq %rax, %xmm0 +; CHECK-SSE-NEXT: movzbl %dil, %eax +; CHECK-SSE-NEXT: shlq $52, %rax +; CHECK-SSE-NEXT: movabsq $3936146074321813504, %rcx # imm = 0x36A0000000000000 +; CHECK-SSE-NEXT: subq %rax, %rcx +; CHECK-SSE-NEXT: movq %rcx, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-AVX-NEXT: shlq $52, %rdi -; CHECK-AVX-NEXT: movabsq $3936146074321813504, %rax # imm = 0x36A0000000000000 -; CHECK-AVX-NEXT: subq %rdi, %rax -; CHECK-AVX-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-NEXT: movzbl %dil, %eax +; CHECK-AVX-NEXT: shlq $52, %rax +; CHECK-AVX-NEXT: movabsq $3936146074321813504, %rcx # imm = 0x36A0000000000000 +; CHECK-AVX-NEXT: subq %rax, %rcx +; CHECK-AVX-NEXT: vmovq %rcx, %xmm0 ; CHECK-AVX-NEXT: retq %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to double @@ -1617,21 +1663,25 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind { ret float %mul } +; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set +; in the original IR. 
define float @fdiv_pow_shl_cnt32_okay(i32 %cnt) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt32_okay: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: shll $23, %edi -; CHECK-SSE-NEXT: movl $285212672, %eax # imm = 0x11000000 -; CHECK-SSE-NEXT: subl %edi, %eax -; CHECK-SSE-NEXT: movd %eax, %xmm0 +; CHECK-SSE-NEXT: movzbl %dil, %eax +; CHECK-SSE-NEXT: shll $23, %eax +; CHECK-SSE-NEXT: movl $285212672, %ecx # imm = 0x11000000 +; CHECK-SSE-NEXT: subl %eax, %ecx +; CHECK-SSE-NEXT: movd %ecx, %xmm0 ; CHECK-SSE-NEXT: retq ; ; CHECK-AVX-LABEL: fdiv_pow_shl_cnt32_okay: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: shll $23, %edi -; CHECK-AVX-NEXT: movl $285212672, %eax # imm = 0x11000000 -; CHECK-AVX-NEXT: subl %edi, %eax -; CHECK-AVX-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-NEXT: movzbl %dil, %eax +; CHECK-AVX-NEXT: shll $23, %eax +; CHECK-AVX-NEXT: movl $285212672, %ecx # imm = 0x11000000 +; CHECK-AVX-NEXT: subl %eax, %ecx +; CHECK-AVX-NEXT: vmovd %ecx, %xmm0 ; CHECK-AVX-NEXT: retq %shl = shl nuw i32 1, %cnt %conv = uitofp i32 %shl to float diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index 675412defbb24..ba51c65ccab13 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -2982,223 +2982,182 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX2-LABEL: store_i8_stride8_vf32: ; AVX2: # %bb.0: -; AVX2-NEXT: subq $88, %rsp +; AVX2-NEXT: subq $40, %rsp ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: vmovdqa (%rsi), %xmm2 -; AVX2-NEXT: vmovdqa (%rdi), %xmm3 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa (%rcx), %xmm4 -; AVX2-NEXT: vmovdqa (%rdx), %xmm5 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] -; AVX2-NEXT: vmovdqa (%r10), %xmm6 -; AVX2-NEXT: vmovdqa (%rax), %xmm7 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm13[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm9 -; AVX2-NEXT: vmovdqa (%r9), %xmm10 -; AVX2-NEXT: vmovdqa (%r8), %xmm11 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX2-NEXT: vpshuflw {{.*#+}} 
xmm8 = xmm14[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm8, %ymm15 -; AVX2-NEXT: vmovaps 16(%rsi), %xmm8 -; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm9[1],ymm15[2],ymm9[3],ymm15[4,5,6,7,8],ymm9[9],ymm15[10],ymm9[11],ymm15[12,13,14,15] -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3],ymm0[4],ymm15[5],ymm0[6],ymm15[7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa 16(%rcx), %xmm8 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm13 -; AVX2-NEXT: vmovdqa 16(%rdx), %xmm15 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5],ymm13[6],ymm1[7],ymm13[8,9,10,11,12],ymm1[13],ymm13[14],ymm1[15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-NEXT: vmovdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm7 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2,3,4],ymm2[5],ymm7[6],ymm2[7],ymm7[8,9,10,11,12],ymm2[13],ymm7[14],ymm2[15] +; AVX2-NEXT: vmovdqa (%r10), %xmm3 +; AVX2-NEXT: vmovdqa (%rax), %xmm4 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-NEXT: vmovdqa (%r9), %xmm6 +; AVX2-NEXT: vmovdqa (%r8), %xmm0 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5,6,7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13,14,15] +; AVX2-NEXT: vmovdqa (%rsi), %xmm5 +; AVX2-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm9 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero,xmm13[2],zero,zero,zero,xmm13[3],zero,zero,zero +; AVX2-NEXT: vmovdqa (%rcx), %xmm8 +; AVX2-NEXT: vmovdqa (%rdx), %xmm10 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7] +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm15 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm15 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7,8],ymm15[9],ymm9[10,11,12],ymm15[13],ymm9[14,15] +; AVX2-NEXT: vmovaps 16(%r10), %xmm7 +; AVX2-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0],ymm12[1],ymm15[2],ymm12[3],ymm15[4],ymm12[5],ymm15[6],ymm12[7] +; AVX2-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill +; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 +; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,7,7] +; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX2-NEXT: vmovdqa 16(%rax), %xmm12 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0,1,2,3,4],ymm2[5],ymm11[6],ymm2[7],ymm11[8,9,10,11,12],ymm2[13],ymm11[14],ymm2[15] +; AVX2-NEXT: vpshufhw 
{{.*#+}} xmm11 = xmm14[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm11, %ymm11 +; AVX2-NEXT: vmovdqa 16(%r9), %xmm14 +; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero,xmm13[2],zero,zero,zero,xmm13[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7,8],ymm11[9],ymm13[10,11,12],ymm11[13],ymm13[14,15] +; AVX2-NEXT: vmovdqa 16(%r8), %xmm15 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-NEXT: vmovdqa %xmm9, %xmm5 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX2-NEXT: vmovdqa %xmm8, %xmm9 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX2-NEXT: vmovdqa 16(%r10), %xmm8 -; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3,4],ymm13[5],ymm7[6,7,8],ymm13[9],ymm7[10,11,12],ymm13[13],ymm7[14,15] -; AVX2-NEXT: vmovdqa 16(%rax), %xmm4 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm14, %ymm2 -; AVX2-NEXT: vmovdqa 16(%r9), %xmm3 -; AVX2-NEXT: vmovdqa 16(%r8), %xmm1 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm14[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7],ymm0[8,9,10,11,12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = 
xmm2[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm4, %ymm11 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,7,7] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6],ymm11[7],ymm0[8,9,10,11,12],ymm11[13],ymm0[14],ymm11[15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4],ymm0[5],ymm7[6],ymm0[7] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm8, %ymm8 +; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm0 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm12[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm15, %ymm7 -; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7,8],ymm7[9],ymm0[10,11,12],ymm7[13],ymm0[14,15] -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; 
AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6],ymm5[7],ymm3[8,9,10,11,12],ymm5[13],ymm3[14],ymm5[15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4,5,6,7,8],ymm3[9],ymm1[10],ymm3[11],ymm1[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5,6,7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm5 # 16-byte Reload -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3],xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm11[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,6,5,7,7] +; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm8, %ymm8 +; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm0[5],ymm8[6],ymm0[7],ymm8[8,9,10,11,12],ymm0[13],ymm8[14],ymm0[15] +; AVX2-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX2-NEXT: vmovdqa 16(%rdx), %xmm4 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm3[8],xmm12[9],xmm3[9],xmm12[10],xmm3[10],xmm12[11],xmm3[11],xmm12[12],xmm3[12],xmm12[13],xmm3[13],xmm12[14],xmm3[14],xmm12[15],xmm3[15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm6, %ymm6 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,6,5,7,7] +; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 +; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5],ymm14[6],ymm6[7],ymm14[8,9,10,11,12],ymm6[13],ymm14[14],ymm6[15] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm4, %ymm4 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7,8],ymm0[9],ymm4[10,11,12],ymm0[13],ymm4[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[2,1,3,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5,6,7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7] +; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4,5,6,7,8],ymm1[9],ymm3[10],ymm1[11],ymm3[12,13,14,15] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpshufb %xmm5, %xmm13, %xmm3 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5,6,7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13,14,15] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload +; AVX2-NEXT: # ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7,8],ymm3[9],ymm4[10,11,12],ymm3[13],ymm4[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX2-NEXT: vmovdqa %ymm2, 128(%rax) -; AVX2-NEXT: vmovdqa %ymm1, 192(%rax) +; AVX2-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX2-NEXT: vmovdqa %ymm1, 128(%rax) +; AVX2-NEXT: vmovdqa %ymm12, 192(%rax) ; AVX2-NEXT: vmovdqa %ymm0, 224(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm0, 160(%rax) +; AVX2-NEXT: vmovdqa %ymm8, 160(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 96(%rax) ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, (%rax) -; AVX2-NEXT: addq $88, %rsp +; AVX2-NEXT: addq $40, %rsp ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3508,166 +3467,134 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: vmovdqa (%r10), %xmm1 ; AVX512-NEXT: vmovdqa 16(%r10), %xmm11 -; AVX512-NEXT: vmovdqa (%rax), %xmm2 +; AVX512-NEXT: vmovdqa (%rax), %xmm5 ; AVX512-NEXT: vmovdqa 16(%rax), %xmm12 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm22 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vmovdqa (%r9), %xmm3 -; AVX512-NEXT: vmovdqa 16(%r9), %xmm13 -; AVX512-NEXT: vmovdqa (%r8), %xmm4 -; AVX512-NEXT: vmovdqa 16(%r8), %xmm14 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512-NEXT: vmovdqa64 %xmm4, 
%xmm23 -; AVX512-NEXT: vmovdqa64 %xmm3, %xmm24 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7],ymm5[8,9,10],ymm1[11],ymm5[12,13,14],ymm1[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vmovdqa (%r9), %xmm6 +; AVX512-NEXT: vmovdqa 16(%r9), %xmm14 +; AVX512-NEXT: vmovdqa (%r8), %xmm7 +; AVX512-NEXT: vmovdqa 16(%r8), %xmm15 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 ; AVX512-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm25 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm21 +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512-NEXT: vmovdqa (%rcx), %xmm8 ; AVX512-NEXT: vmovdqa (%rdx), %xmm9 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm15 = xmm10[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm15, %ymm3 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm3[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm10 -; AVX512-NEXT: vmovdqa 16(%rcx), %xmm5 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm10[0,0,1,1,2,2,3,3] +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm13 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7,8],ymm13[9],ymm3[10,11,12],ymm13[13],ymm3[14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm13, %ymm10 +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7,8],ymm10[9],ymm0[10,11,12],ymm10[13],ymm0[14,15] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm18 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, 
%ymm1 +; AVX512-NEXT: vmovdqa 16(%rcx), %xmm2 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3],ymm10[4,5,6],ymm1[7],ymm10[8,9,10],ymm1[11],ymm10[12,13,14],ymm1[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6],ymm3[7],ymm1[8,9,10],ymm3[11],ymm1[12,13,14],ymm3[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm3, %ymm3 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] ; AVX512-NEXT: vmovdqa 16(%rdx), %xmm10 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm15 -; AVX512-NEXT: vmovdqa 16(%rsi), %xmm4 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm19 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm20 = xmm0[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512-NEXT: vinserti32x4 $1, %xmm20, %ymm19, %ymm1 -; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7,8],ymm15[9],ymm1[10,11,12],ymm15[13],ymm1[14,15] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm15, %ymm0 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm15 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 
= xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm3[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm6, %ymm6 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm1, %ymm19 +; AVX512-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm13 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7,8],ymm0[9],ymm4[10,11,12],ymm0[13],ymm4[14,15] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm19[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7,8],ymm4[9],ymm1[10,11,12],ymm4[13],ymm1[14,15] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm4[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm11, %ymm11 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,0,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm11, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3],ymm6[4,5,6],ymm1[7],ymm6[8,9,10],ymm1[11],ymm6[12,13,14],ymm1[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3 +; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3],ymm11[4,5,6],ymm1[7],ymm11[8,9,10],ymm1[11],ymm11[12,13,14],ymm1[15] +; 
AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm11, %ymm4 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7,8],ymm0[9],ymm3[10,11,12],ymm0[13],ymm3[14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX512-NEXT: 
vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa64 %xmm23, %xmm3 -; AVX512-NEXT: vmovdqa64 %xmm24, %xmm4 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7] ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 @@ -3684,40 +3611,32 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 %xmm25, %xmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,6,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = 
xmm5[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] +; AVX512-NEXT: vmovdqa64 %xmm20, %xmm4 +; AVX512-NEXT: vmovdqa64 %xmm21, %xmm5 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7,8],ymm2[9],ymm5[10,11,12],ymm2[13],ymm5[14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7,8],ymm3[9],ymm4[10,11,12],ymm3[13],ymm4[14,15] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512-NEXT: kmovw %ecx, %k1 ; AVX512-NEXT: vmovdqa32 %zmm16, %zmm18 {%k1} -; AVX512-NEXT: vmovdqa32 %zmm17, %zmm15 {%k1} +; AVX512-NEXT: vmovdqa32 %zmm17, %zmm19 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} ; AVX512-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm19, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -3890,212 +3809,176 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm10 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm19 -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512DQ-NEXT: vmovdqa 16(%rcx), %xmm12 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm13 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-NEXT: 
vmovdqa64 %xmm2, %xmm21 -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm22 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7,8],ymm7[9],ymm5[10,11,12],ymm7[13],ymm5[14,15] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm7, %ymm0 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm16 -; AVX512DQ-NEXT: vmovdqa (%r10), %xmm5 -; AVX512DQ-NEXT: vmovdqa (%rax), %xmm6 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm9 -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm7 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm8 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm15 = xmm14[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm15, %ymm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3],ymm1[4,5,6],ymm9[7],ymm1[8,9,10],ymm9[11],ymm1[12,13,14],ymm9[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm9, %ymm9 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5,6],ymm0[7],ymm9[8,9,10],ymm0[11],ymm9[12,13,14],ymm0[15] -; AVX512DQ-NEXT: movw $-21846, %cx # imm = 0xAAAA -; AVX512DQ-NEXT: kmovw %ecx, %k1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm16 {%k1} -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd 
{{.*#+}} xmm9 = xmm0[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm14, %ymm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-NEXT: vmovdqa 16(%r10), %xmm14 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm9, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm15 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512DQ-NEXT: vmovdqa (%r10), %xmm0 +; AVX512DQ-NEXT: vmovdqa 16(%r10), %xmm9 +; AVX512DQ-NEXT: vmovdqa (%rax), %xmm2 +; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm10 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm23 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm18 -; AVX512DQ-NEXT: vmovdqa 16(%r9), %xmm3 -; AVX512DQ-NEXT: vmovdqa 16(%r8), %xmm9 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-NEXT: vmovdqa 16(%r9), %xmm11 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm6 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = 
xmm5[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7],ymm7[8,9,10],ymm0[11],ymm7[12,13,14],ymm0[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6],ymm4[7],ymm2[8,9,10],ymm4[11],ymm2[12,13,14],ymm4[15] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7],ymm0[8,9,10],ymm1[11],ymm0[12,13,14],ymm1[15] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm17 {%k1} -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm10 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] -; 
AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm1[3],ymm5[4,5,6],ymm1[7],ymm5[8,9,10],ymm1[11],ymm5[12,13,14],ymm1[15] +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm13 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm25 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm15 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3] +; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm12 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7,8],ymm12[9],ymm15[10,11,12],ymm12[13],ymm15[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,3,2,3] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7,8],ymm13[9],ymm14[10,11,12],ymm13[13],ymm14[14,15] +; AVX512DQ-NEXT: vmovdqa 16(%r8), %xmm14 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm16 +; AVX512DQ-NEXT: movw $-21846, %ax # imm = 0xAAAA +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm5, %zmm16 {%k1} +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm12, %ymm5, %ymm17 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm5[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm13, %ymm12, %ymm18 +; AVX512DQ-NEXT: 
vmovdqa 16(%rcx), %xmm13 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm12, %ymm19 +; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm15 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm0, %ymm20 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm12, %ymm0, %ymm21 +; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,3,2,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7,8],ymm12[9],ymm2[10,11,12],ymm12[13],ymm2[14,15] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm12 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7],ymm2[8,9,10],ymm0[11],ymm2[12,13,14],ymm0[15] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm10 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm3 -; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm4 -; 
AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm4 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7],ymm3[8,9,10],ymm2[11],ymm3[12,13,14],ymm2[15] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm2, %zmm12 {%k1} +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm13, 
%ymm11, %ymm11 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0,0,1,1,2,2,3,3] +; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2,3,4],ymm9[5],ymm5[6,7,8],ymm9[9],ymm5[10,11,12],ymm9[13],ymm5[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7,8],ymm9[9],ymm1[10,11,12],ymm9[13],ymm1[14,15] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm2 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 ; AVX512DQ-NEXT: 
vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm7 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7,8],ymm5[9],ymm7[10,11,12],ymm5[13],ymm7[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7,8],ymm6[9],ymm4[10,11,12],ymm6[13],ymm4[14,15] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7],ymm3[8,9,10],ymm1[11],ymm3[12,13,14],ymm1[15] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k1} +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm4 {%k1} ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -4266,102 +4149,58 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-NEXT: vmovdqa 16(%rsi), %xmm11 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512BW-NEXT: vmovdqa 16(%rcx), %xmm13 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm4 -; 
AVX512BW-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero -; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7,8],ymm7[9],ymm5[10,11,12],ymm7[13],ymm5[14,15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm22 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] -; AVX512BW-NEXT: vpermt2w %ymm6, %ymm22, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqa (%r11), %xmm6 -; AVX512BW-NEXT: vmovdqa 16(%r11), %xmm15 -; AVX512BW-NEXT: vmovdqa (%r10), %xmm7 -; AVX512BW-NEXT: vmovdqa64 16(%r10), %xmm17 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX512BW-NEXT: vmovdqa (%r9), %xmm8 -; AVX512BW-NEXT: vmovdqa64 16(%r9), %xmm18 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm9 -; AVX512BW-NEXT: vmovdqa64 16(%r8), %xmm19 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] -; AVX512BW-NEXT: vpermt2w %zmm16, %zmm24, %zmm20 +; AVX512BW-NEXT: vmovdqa (%r11), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%r11), %xmm1 +; AVX512BW-NEXT: vmovdqa (%r10), %xmm2 +; AVX512BW-NEXT: vmovdqa 16(%r10), %xmm3 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512BW-NEXT: vmovdqa (%r9), %xmm5 +; AVX512BW-NEXT: vmovdqa 16(%r9), %xmm6 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm7 +; AVX512BW-NEXT: vmovdqa 16(%r8), %xmm8 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] +; AVX512BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm9 +; AVX512BW-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512BW-NEXT: vmovdqa 16(%rcx), %xmm11 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512BW-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15] +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512BW-NEXT: vmovdqa64 16(%rsi), %xmm16 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm17 +; AVX512BW-NEXT: vmovdqa64 16(%rdi), %xmm18 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm20, %zmm19 ; AVX512BW-NEXT: movw $-21846, %cx # imm = 
0xAAAA ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm16[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm5 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm20[0,0,2,1,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm21 = xmm21[0],zero,xmm21[1],zero -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm23 = xmm20[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm23 = xmm23[0],zero,xmm23[1],zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm21, %ymm10 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7,8],ymm10[9],ymm5[10,11,12],ymm10[13],ymm5[14,15] -; AVX512BW-NEXT: vpermt2w %ymm20, %ymm22, %ymm16 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm16, %zmm5, %zmm16 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3],xmm19[4],xmm18[4],xmm19[5],xmm18[5],xmm19[6],xmm18[6],xmm19[7],xmm18[7] -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm24, %zmm10 -; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm16 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,0,2,1,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero -; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15] -; AVX512BW-NEXT: vpermt2w %ymm11, %ymm22, %ymm5 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] -; AVX512BW-NEXT: vpermt2w 
%zmm10, %zmm24, %zmm11 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] -; AVX512BW-NEXT: vpermt2w %ymm3, %ymm22, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512BW-NEXT: vpermt2w %zmm2, %zmm24, %zmm3 -; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm19 {%k1} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm14 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7] +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm20, %zmm21 +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm21 {%k1} +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm10, %zmm3 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = 
xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm20, %zmm6 +; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm6 {%k1} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm1 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm20, %zmm2 +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -4430,102 +4269,58 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa 16(%rsi), %xmm11 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-BW-NEXT: vmovdqa 16(%rcx), %xmm13 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-BW-NEXT: vmovdqa 16(%rdx), %xmm14 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,2,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm7[1],ymm5[2,3,4],ymm7[5],ymm5[6,7,8],ymm7[9],ymm5[10,11,12],ymm7[13],ymm5[14,15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm22 = [4,20,0,0,5,21,0,0,6,22,0,0,7,23,0,0] -; AVX512DQ-BW-NEXT: vpermt2w %ymm6, %ymm22, %ymm0 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm6 -; AVX512DQ-BW-NEXT: vmovdqa 16(%r11), %xmm15 -; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%r10), %xmm17 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = 
xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%r9), %xmm18 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%r8), %xmm19 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] -; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm24, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm0 +; AVX512DQ-BW-NEXT: vmovdqa 16(%r11), %xmm1 +; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm2 +; AVX512DQ-BW-NEXT: vmovdqa 16(%r10), %xmm3 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm5 +; AVX512DQ-BW-NEXT: vmovdqa 16(%r9), %xmm6 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm7 +; AVX512DQ-BW-NEXT: vmovdqa 16(%r8), %xmm8 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] +; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm4 +; AVX512DQ-BW-NEXT: vmovdqa 16(%rcx), %xmm11 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512DQ-BW-NEXT: vmovdqa 16(%rdx), %xmm13 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15] +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rsi), %xmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %xmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdi), %xmm18 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm20, %zmm19 ; AVX512DQ-BW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm16[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm5 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm21 = xmm20[0,0,2,1,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm21 = xmm21[0],zero,xmm21[1],zero -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm23 = 
xmm20[0,2,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm23 = xmm23[0],zero,xmm23[1],zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm21, %ymm10 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7,8],ymm10[9],ymm5[10,11,12],ymm10[13],ymm5[14,15] -; AVX512DQ-BW-NEXT: vpermt2w %ymm20, %ymm22, %ymm16 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm16, %zmm5, %zmm16 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm19[0],xmm18[0],xmm19[1],xmm18[1],xmm19[2],xmm18[2],xmm19[3],xmm18[3],xmm19[4],xmm18[4],xmm19[5],xmm18[5],xmm19[6],xmm18[6],xmm19[7],xmm18[7] -; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm24, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,0,2,1,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[0,2,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm13[0],zero,xmm13[1],zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm12, %ymm12 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7,8],ymm12[9],ymm10[10,11,12],ymm12[13],ymm10[14,15] -; AVX512DQ-BW-NEXT: vpermt2w %ymm11, %ymm22, %ymm5 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm19[8],xmm18[8],xmm19[9],xmm18[9],xmm19[10],xmm18[10],xmm19[11],xmm18[11],xmm19[12],xmm18[12],xmm19[13],xmm18[13],xmm19[14],xmm18[14],xmm19[15],xmm18[15] -; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; 
AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[0,2,2,3,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4 -; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] -; AVX512DQ-BW-NEXT: vpermt2w %ymm3, %ymm22, %ymm1 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm24, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm9, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm14 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm20, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm10, %zmm3 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm20, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm3, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm1 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm20, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -5923,407 +5718,336 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: subq $328, %rsp # imm = 0x148 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: vmovdqa (%rsi), %xmm0 +; AVX2-NEXT: vmovdqa (%r10), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 48(%r10), %xmm11 +; AVX2-NEXT: vmovdqa (%rax), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rdx), %xmm3 +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa (%r9), %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovdqa (%r8), %xmm3 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] -; AVX2-NEXT: vmovdqa (%r10), %xmm0 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7] +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6],ymm1[7],ymm4[8,9,10,11,12],ymm1[13],ymm4[14],ymm1[15] +; AVX2-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%rax), %xmm1 +; AVX2-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpcklbw 
{{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa (%r9), %xmm0 +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa (%r8), %xmm6 +; AVX2-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-NEXT: vmovdqa 48(%rsi), %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm1[5],ymm7[6],ymm1[7],ymm7[8,9,10,11,12],ymm1[13],ymm7[14],ymm1[15] -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] -; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm5 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7,8],ymm1[9],ymm7[10,11,12],ymm1[13],ymm7[14,15] +; AVX2-NEXT: vmovdqa 48(%rax), %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5,6,7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13,14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7] +; AVX2-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; 
AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa 48(%r9), %xmm3 +; AVX2-NEXT: vmovdqa 48(%r8), %xmm5 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 +; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5,6,7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13,14,15] ; AVX2-NEXT: vmovdqa 48(%rcx), %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7,8],ymm3[9],ymm5[10,11,12],ymm3[13],ymm5[14,15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5,6,7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm4 -; AVX2-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] -; AVX2-NEXT: vmovdqa 48(%r10), %xmm4 -; AVX2-NEXT: vmovdqa 48(%rax), %xmm5 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm11 = 
xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm12 -; AVX2-NEXT: vmovdqa 48(%r9), %xmm6 -; AVX2-NEXT: vmovdqa 48(%r8), %xmm7 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm14[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm15, %ymm13 -; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5,6,7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] +; AVX2-NEXT: vmovdqa 48(%rdx), %xmm4 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX2-NEXT: vpshufb %xmm14, %xmm12, %xmm6 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm13 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero +; AVX2-NEXT: vmovdqa 48(%rsi), %xmm6 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7,8],ymm13[9],ymm15[10,11,12],ymm13[13],ymm15[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2],ymm10[3],ymm13[4],ymm10[5],ymm13[6],ymm10[7] ; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7,8],ymm9[9],ymm8[10,11,12],ymm9[13],ymm8[14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6],ymm8[7],ymm9[8,9,10,11,12],ymm8[13],ymm9[14],ymm8[15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm12[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm10, 
%ymm9, %ymm9 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm14[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm14[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6],ymm9[7],ymm10[8,9,10,11,12],ymm9[13],ymm10[14],ymm9[15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15] -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7,8],ymm9[9],ymm0[10,11,12],ymm9[13],ymm0[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5,6,7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13,14,15] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-NEXT: 
vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5,6,7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX2-NEXT: vpshufb %xmm14, %xmm2, %xmm4 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6],ymm0[7],ymm1[8,9,10,11,12],ymm0[13],ymm1[14],ymm0[15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6],ymm0[7],ymm2[8,9,10,11,12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 32(%r10), %xmm0 +; AVX2-NEXT: vmovdqa 32(%rax), %xmm1 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm4 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX2-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 +; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5,6,7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13,14,15] ; AVX2-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX2-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7,8],ymm5[9],ymm4[10,11,12],ymm5[13],ymm4[14,15] -; AVX2-NEXT: vmovdqa 32(%r10), %xmm4 -; AVX2-NEXT: vmovdqa 32(%rax), %xmm5 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm12 -; AVX2-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX2-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX2-NEXT: vmovdqa 32(%rdx), %xmm4 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX2-NEXT: vpshufb %xmm14, %xmm11, %xmm6 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm12 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero +; AVX2-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm7 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14 -; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5,6,7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13,14,15] -; AVX2-NEXT: vpshufd 
{{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm15 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero,xmm13[2],zero,zero,zero,xmm13[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7,8],ymm12[9],ymm15[10,11,12],ymm12[13],ymm15[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] ; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero +; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7] ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9 -; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7,8],ymm9[9],ymm8[10,11,12],ymm9[13],ymm8[14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6],ymm8[7],ymm9[8,9,10,11,12],ymm8[13],ymm9[14],ymm8[15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,4,6,5] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6],ymm9[7],ymm10[8,9,10,11,12],ymm9[13],ymm10[14],ymm9[15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] +; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7,8],ymm9[9],ymm10[10,11,12],ymm9[13],ymm10[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7] ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15] -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5,6,7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13,14,15] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5,6,7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4],ymm4[5],ymm1[6],ymm4[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX2-NEXT: vpshufb %xmm14, %xmm2, %xmm4 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; 
AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6],ymm0[7],ymm1[8,9,10,11,12],ymm0[13],ymm1[14],ymm0[15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6],ymm1[7],ymm2[8,9,10,11,12],ymm1[13],ymm2[14],ymm1[15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vmovdqa 16(%rsi), %xmm14 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqa 16(%rcx), %xmm11 -; AVX2-NEXT: vmovdqa 16(%rdx), %xmm9 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX2-NEXT: vmovdqa 16(%r10), %xmm7 -; AVX2-NEXT: vmovdqa 16(%rax), %xmm6 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3 +; AVX2-NEXT: vmovdqa 16(%r10), %xmm12 +; AVX2-NEXT: vmovdqa 16(%rax), %xmm10 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = 
xmm10[8],xmm12[8],xmm10[9],xmm12[9],xmm10[10],xmm12[10],xmm10[11],xmm12[11],xmm10[12],xmm12[12],xmm10[13],xmm12[13],xmm10[14],xmm12[14],xmm10[15],xmm12[15] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 16(%r9), %xmm5 ; AVX2-NEXT: vmovdqa 16(%r8), %xmm4 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm2, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5,6,7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm2[1],ymm10[2],ymm2[3],ymm10[4],ymm2[5],ymm10[6],ymm2[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7],ymm0[8,9,10,11,12],ymm2[13],ymm0[14],ymm2[15] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5,6,7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13,14,15] +; AVX2-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX2-NEXT: vmovdqa 16(%rdx), %xmm6 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX2-NEXT: vpshufb %xmm14, %xmm15, %xmm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm13 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm13[0],ymm0[1],ymm13[2,3,4],ymm0[5],ymm13[6,7,8],ymm0[9],ymm13[10,11,12],ymm0[13],ymm13[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm9[1],ymm0[2],ymm9[3],ymm0[4],ymm9[5],ymm0[6],ymm9[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm11[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,7,7] +; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm8, %ymm8 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5],ymm8[6],ymm0[7],ymm8[8,9,10,11,12],ymm0[13],ymm8[14],ymm0[15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm8, %ymm8 +; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7,8],ymm8[9],ymm1[10,11,12],ymm8[13],ymm1[14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm3, %ymm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15] -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 
-; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4,5,6,7,8],ymm6[9],ymm5[10],ymm6[11],ymm5[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4],ymm5[5],ymm0[6],ymm5[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm5 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4,5,6,7,8],ymm0[9],ymm5[10],ymm0[11],ymm5[12,13,14,15] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX2-NEXT: vpshufb %xmm14, %xmm3, %xmm5 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,5,7] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6],ymm2[7],ymm3[8,9,10,11,12],ymm2[13],ymm3[14],ymm2[15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6],ymm1[7],ymm4[8,9,10,11,12],ymm1[13],ymm4[14],ymm1[15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpblendw 
{{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX2-NEXT: # xmm2 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero +; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,3,3,4,5,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4,5,6,7,8],ymm3[9],ymm5[10],ymm3[11],ymm5[12,13,14,15] ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-NEXT: # xmm5 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6 -; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,1,3,4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm9, %ymm9 -; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2],ymm6[3],ymm9[4,5,6,7,8],ymm6[9],ymm9[10],ymm6[11],ymm9[12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4 -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4],ymm4[5],ymm2[6,7,8],ymm4[9],ymm2[10,11,12],ymm4[13],ymm2[14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,4,6,5] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,7,7] -; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6],ymm4[7],ymm5[8,9,10,11,12],ymm4[13],ymm5[14],ymm4[15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7] +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-NEXT: # xmm6 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] +; AVX2-NEXT: vpshufb %xmm14, %xmm6, %xmm7 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7,8],ymm7[9],ymm10[10,11,12],ymm7[13],ymm10[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm2 +; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6],ymm2[7],ymm4[8,9,10,11,12],ymm2[13],ymm4[14],ymm2[15] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,4,6,5] +; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa %ymm2, 96(%rax) ; AVX2-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX2-NEXT: vmovdqa %ymm1, 160(%rax) ; AVX2-NEXT: vmovdqa %ymm0, 128(%rax) ; AVX2-NEXT: vmovdqa %ymm8, 224(%rax) -; AVX2-NEXT: vmovdqa %ymm10, 192(%rax) +; AVX2-NEXT: vmovdqa %ymm9, 192(%rax) ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vmovaps %ymm0, 288(%rax) ; 
AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6992,147 +6716,146 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512-LABEL: store_i8_stride8_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $680, %rsp # imm = 0x2A8 +; AVX512-NEXT: subq $552, %rsp # imm = 0x228 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512-NEXT: vmovdqa 48(%rcx), %xmm2 -; AVX512-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%r10), %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 48(%r10), %xmm4 -; AVX512-NEXT: vmovdqa (%rax), %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 48(%rax), %xmm5 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%r9), %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 48(%r9), %xmm7 -; AVX512-NEXT: vmovdqa (%r8), %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 48(%r8), %xmm12 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm9 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm13, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm11 -; AVX512-NEXT: vmovdqa 32(%rax), %xmm2 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm13, %ymm31 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm28 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm23 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm21 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] -; AVX512-NEXT: 
vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm20 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm30 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm29 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm24 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm22 -; AVX512-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm13, %ymm12, %ymm19 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm12, %ymm18 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm4 -; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa 32(%rcx), %xmm12 +; AVX512-NEXT: vmovdqa 48(%rcx), %xmm0 +; AVX512-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm10 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa (%r10), %xmm5 +; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa 48(%r10), %xmm3 +; AVX512-NEXT: vmovdqa (%rax), %xmm2 +; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa 48(%rax), %xmm4 +; 
AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa (%r9), %xmm5 +; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa 48(%r9), %xmm6 +; AVX512-NEXT: vmovdqa (%r8), %xmm2 +; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa 48(%r8), %xmm8 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm5, %ymm5 +; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm5, %ymm5 +; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm5, %ymm5 +; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw 
{{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm9, %ymm26 +; AVX512-NEXT: vmovdqa 32(%r10), %xmm2 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm9, %ymm0, %ymm28 +; AVX512-NEXT: vmovdqa 32(%rax), %xmm0 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm9, %ymm29 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm25 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm23 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm19 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm17 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm16 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm14 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm6, %ymm3, %ymm20 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm18 +; AVX512-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512-NEXT: vmovdqa 
32(%r8), %xmm3 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm15 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm13 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm12[8],xmm5[9],xmm12[9],xmm5[10],xmm12[10],xmm5[11],xmm12[11],xmm5[12],xmm12[12],xmm5[13],xmm12[13],xmm5[14],xmm12[14],xmm5[15],xmm12[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm4 +; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm4 +; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 @@ -7141,334 +6864,245 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 16(%rcx), %xmm14 -; AVX512-NEXT: vmovdqa 16(%rdx), %xmm12 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX512-NEXT: vmovdqa 16(%rcx), %xmm4 +; AVX512-NEXT: vmovdqa 16(%rdx), %xmm3 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX512-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 16(%r10), %xmm1 -; AVX512-NEXT: vmovdqa 16(%rax), %xmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm17 +; AVX512-NEXT: vmovdqa 16(%r10), %xmm9 +; AVX512-NEXT: vmovdqa 16(%rax), %xmm8 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm1, %ymm30 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 16(%r9), %xmm0 -; AVX512-NEXT: vmovdqa 16(%r8), %xmm15 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512-NEXT: vmovdqa64 %xmm2, %xmm25 -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm27 -; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512-NEXT: vmovdqa 48(%rsi), %xmm2 -; AVX512-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = 
xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512-NEXT: # ymm5 = mem[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm10 -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm13 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm13 & (zmm10 ^ zmm3)) -; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm4 -; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512-NEXT: # ymm5 = mem[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX512-NEXT: vpandnq %zmm4, %zmm3, %zmm4 -; AVX512-NEXT: vpandq %zmm3, %zmm5, %zmm5 +; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm31 +; AVX512-NEXT: vmovdqa 16(%r9), %xmm6 +; AVX512-NEXT: vmovdqa 16(%r8), %xmm5 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm11, %ymm1, %ymm27 +; AVX512-NEXT: vmovdqa 48(%rsi), %xmm1 +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm11 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3],xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] +; AVX512-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] +; AVX512-NEXT: vmovdqa64 %xmm7, %xmm21 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm22 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero,xmm11[2],zero,zero,zero,xmm11[3],zero,zero,zero,xmm11[4],zero,zero,zero,xmm11[5],zero,zero,zero,xmm11[6],zero,zero,zero,xmm11[7],zero,zero,zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512-NEXT: # 
ymm11 = mem[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm11 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm0 ^ (zmm11 & (zmm22 ^ zmm0)) +; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512-NEXT: # ymm7 = mem[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512-NEXT: # ymm7 = mem[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512-NEXT: # ymm10 = mem[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm10 +; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm7 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX512-NEXT: vpandnq %zmm0, %zmm7, %zmm0 +; AVX512-NEXT: vpandq %zmm7, %zmm10, %zmm10 ; AVX512-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpord %zmm4, %zmm5, %zmm10 {%k1} -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm9 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm13 & (zmm9 ^ zmm0)) +; AVX512-NEXT: vpord %zmm0, %zmm10, %zmm22 {%k1} +; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512-NEXT: # ymm10 = mem[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm0 ^ (zmm11 & (zmm24 ^ zmm0)) ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512-NEXT: # ymm2 = mem[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512-NEXT: # ymm2 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd $232, (%rsp), %ymm4 # 32-byte Folded Reload -; AVX512-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512-NEXT: vpandnq %zmm0, %zmm3, %zmm0 -; AVX512-NEXT: vpandq %zmm3, %zmm2, %zmm2 -; AVX512-NEXT: vpord %zmm0, %zmm2, %zmm9 {%k1} -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = 
xmm1[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm11 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm13 & (zmm11 ^ zmm0)) -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512-NEXT: # ymm10 = mem[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512-NEXT: # ymm10 = mem[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm26 = ymm26[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm26, %zmm10, %zmm10 +; AVX512-NEXT: vpandnq %zmm0, %zmm7, %zmm0 +; AVX512-NEXT: vpandq %zmm7, %zmm10, %zmm10 +; AVX512-NEXT: vpord %zmm0, %zmm10, %zmm24 {%k1} +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm11 & (zmm10 ^ zmm0)) +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512-NEXT: vpandnq %zmm0, %zmm3, %zmm0 -; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1 -; AVX512-NEXT: vpord %zmm0, %zmm1, %zmm11 {%k1} -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm1, %zmm1 +; AVX512-NEXT: vpandnq %zmm0, %zmm7, %zmm0 +; AVX512-NEXT: vpandq %zmm7, %zmm1, %zmm1 +; AVX512-NEXT: vpord %zmm0, %zmm1, %zmm10 {%k1} +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm17 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm28 -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm13 & (zmm4 ^ zmm0)) -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm24[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm22[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm18[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512-NEXT: vpandnq %zmm0, %zmm3, %zmm0 -; AVX512-NEXT: vpandq %zmm3, %zmm6, %zmm6 -; AVX512-NEXT: vpord %zmm0, %zmm6, %zmm4 {%k1} -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm19 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm14 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm11 & (zmm4 ^ zmm1)) +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm3, %zmm3 +; AVX512-NEXT: vpandnq %zmm1, %zmm7, %zmm1 +; AVX512-NEXT: vpandq %zmm7, %zmm3, %zmm3 +; AVX512-NEXT: vpord %zmm1, %zmm3, %zmm4 {%k1} +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm18 -; AVX512-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm17, %xmm2 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm6, %ymm2, %ymm21 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm2, %ymm22 -; AVX512-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm2, %ymm26 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm6, %ymm7, %ymm19 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX512-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm14 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm15 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX512-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm12, %ymm8, %ymm17 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,6,5] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] -; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm8, %ymm20 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX512-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,1,1,3,4,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm8 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm1, %ymm20 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 
$1, %xmm8, %ymm3, %ymm23 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm9 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX512-NEXT: # xmm5 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm6, %ymm13 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm15 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX512-NEXT: # xmm5 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm6, %ymm16 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,4,6,5] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm1, %ymm18 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX512-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,1,3,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 -; AVX512-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] -; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[2,3,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512-NEXT: vpshufd 
{{.*#+}} xmm23 = xmm5[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512-NEXT: vinserti32x4 $1, %xmm23, %ymm12, %ymm12 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512-NEXT: vinserti32x4 $1, %xmm5, %ymm23, %ymm5 -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm12 -; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512-NEXT: # ymm23 = mem[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm5 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm13 & (zmm5 ^ zmm12)) -; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512-NEXT: # ymm12 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512-NEXT: # ymm23 = mem[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm12 -; AVX512-NEXT: vpandnq %zmm12, %zmm3, %zmm12 -; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512-NEXT: # ymm23 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512-NEXT: # ymm24 = mem[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm23 -; AVX512-NEXT: vpandq %zmm3, %zmm23, %zmm23 -; AVX512-NEXT: vpord %zmm12, %zmm23, %zmm5 {%k1} -; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm23 = xmm0[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512-NEXT: vmovdqa64 %xmm25, %xmm2 -; AVX512-NEXT: vmovdqa64 %xmm27, %xmm6 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512-NEXT: vinserti32x4 $1, %xmm23, %ymm12, %ymm12 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm24, %ymm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm23 = xmm7[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm24 = xmm7[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload -; AVX512-NEXT: # ymm30 = mem[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm30, %zmm12, %zmm12 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm13 & (zmm12 ^ zmm0)) -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm30 = xmm2[3,3,3,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; 
AVX512-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 +; AVX512-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm14 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15] +; AVX512-NEXT: vmovdqa64 %xmm21, %xmm14 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm14 ^ (zmm11 & (zmm0 ^ zmm14)) +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero,xmm8[4],zero,zero,zero,xmm8[5],zero,zero,zero,xmm8[6],zero,zero,zero,xmm8[7],zero,zero,zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero,xmm12[4],zero,zero,zero,xmm12[5],zero,zero,zero,xmm12[6],zero,zero,zero,xmm12[7],zero,zero,zero +; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512-NEXT: # ymm14 = mem[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512-NEXT: # ymm25 = mem[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload ; AVX512-NEXT: # ymm25 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512-NEXT: # ymm27 = mem[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 -; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512-NEXT: # ymm27 = mem[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512-NEXT: # ymm16 = mem[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 -; AVX512-NEXT: vpandnq %zmm25, %zmm3, %zmm25 -; AVX512-NEXT: vpandq %zmm3, %zmm16, %zmm16 -; AVX512-NEXT: vpord %zmm25, %zmm16, %zmm12 {%k1} -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero -; AVX512-NEXT: vinserti32x4 $1, %xmm23, %ymm16, %ymm16 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512-NEXT: vinserti32x4 $1, %xmm7, %ymm29, %ymm7 -; AVX512-NEXT: 
vinserti64x4 $1, %ymm16, %zmm7, %zmm7 -; AVX512-NEXT: vpshufd {{.*#+}} ymm16 = ymm28[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm13 & (zmm16 ^ zmm7)) -; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm18 = ymm22[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm7, %zmm7 -; AVX512-NEXT: vpshufd {{.*#+}} ymm18 = ymm26[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6 -; AVX512-NEXT: vpandnq %zmm7, %zmm3, %zmm7 -; AVX512-NEXT: vpandq %zmm3, %zmm6, %zmm6 -; AVX512-NEXT: vpord %zmm7, %zmm6, %zmm16 {%k1} -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,1,3,3,6,5,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm13 & (zmm2 ^ zmm0)) -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512-NEXT: # ymm26 = mem[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm25 +; AVX512-NEXT: vpandnq %zmm14, %zmm7, %zmm14 +; AVX512-NEXT: vpandq %zmm7, %zmm25, %zmm25 +; AVX512-NEXT: vpord %zmm14, %zmm25, %zmm0 {%k1} +; AVX512-NEXT: vpshufd $212, (%rsp), %ymm14 # 32-byte Folded Reload +; AVX512-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm14 ^ (zmm11 & (zmm8 ^ zmm14)) +; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 +; AVX512-NEXT: vpshufd {{.*#+}} ymm21 = ymm27[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm21, %zmm17 +; AVX512-NEXT: vpandnq %zmm14, %zmm7, %zmm14 +; AVX512-NEXT: vpandq %zmm7, %zmm17, %zmm17 +; AVX512-NEXT: vpord %zmm14, %zmm17, %zmm8 {%k1} +; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm17 = ymm20[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm14 ^ (zmm11 & (zmm6 ^ zmm14)) +; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9 +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 
+; AVX512-NEXT: vpandnq %zmm9, %zmm7, %zmm3 +; AVX512-NEXT: vpandq %zmm7, %zmm2, %zmm2 +; AVX512-NEXT: vpord %zmm3, %zmm2, %zmm6 {%k1} +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,3,3,6,5,7,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm2 ^ (zmm11 & (zmm12 ^ zmm2)) +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 -; AVX512-NEXT: vpandnq %zmm0, %zmm3, %zmm0 -; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1 -; AVX512-NEXT: vpord %zmm0, %zmm1, %zmm2 {%k1} +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512-NEXT: vpandnq %zmm2, %zmm7, %zmm2 +; AVX512-NEXT: vpandq %zmm7, %zmm1, %zmm1 +; AVX512-NEXT: vpord %zmm2, %zmm1, %zmm12 {%k1} ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm12, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm10, 64(%rax) -; AVX512-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX512-NEXT: vmovdqa64 %zmm10, 448(%rax) +; AVX512-NEXT: vmovdqa64 %zmm24, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rax) +; AVX512-NEXT: addq $552, %rsp # imm = 0x228 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -7766,147 +7400,146 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512DQ-LABEL: store_i8_stride8_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $680, %rsp # imm = 0x2A8 +; AVX512DQ-NEXT: subq $552, %rsp # imm = 0x228 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512DQ-NEXT: vmovdqa 48(%rcx), %xmm2 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 48(%rdx), %xmm3 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r10), %xmm1 -; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 48(%r10), %xmm4 -; AVX512DQ-NEXT: vmovdqa (%rax), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 48(%rax), %xmm5 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm1 -; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 48(%r9), %xmm7 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 48(%r8), %xmm12 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm9 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm13, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%r10), %xmm0 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm11 -; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm2 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm13, %ymm31 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm28 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm23 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm4, %ymm21 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm4, %ymm20 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm30 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm29 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm5, %ymm24 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm5, %ymm22 -; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm7[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: 
vinserti32x4 $1, %xmm13, %ymm12, %ymm19 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm12, %ymm18 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm7, %ymm4 -; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm12 +; AVX512DQ-NEXT: vmovdqa 48(%rcx), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa 48(%rdx), %xmm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm10 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa (%r10), %xmm5 +; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa 48(%r10), %xmm3 +; AVX512DQ-NEXT: vmovdqa (%rax), %xmm2 +; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa 48(%rax), %xmm4 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm5 +; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa 48(%r9), %xmm6 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa 48(%r8), %xmm8 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = 
xmm2[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm5, %ymm5 +; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm5, %ymm5 +; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm2[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm5, %ymm5 +; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm9, %ymm26 +; AVX512DQ-NEXT: vmovdqa 32(%r10), %xmm2 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm9, %ymm0, %ymm28 +; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm0 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm9, %ymm29 +; AVX512DQ-NEXT: 
vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm25 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm23 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm19 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm17 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm4, %ymm3, %ymm16 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm14 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm6, %ymm3, %ymm20 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm3, %ymm18 +; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm15 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm13 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm12[8],xmm5[9],xmm12[9],xmm5[10],xmm12[10],xmm5[11],xmm12[11],xmm5[12],xmm12[12],xmm5[13],xmm12[13],xmm5[14],xmm12[14],xmm5[15],xmm12[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm4 +; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm4 +; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: 
vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,1,3,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 @@ -7915,334 +7548,245 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 16(%rcx), %xmm14 -; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm12 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX512DQ-NEXT: vmovdqa 16(%rcx), %xmm4 +; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm3 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 16(%r10), %xmm1 -; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm2 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm17 +; AVX512DQ-NEXT: vmovdqa 16(%r10), %xmm9 +; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm8 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: 
vinserti32x4 $1, %xmm5, %ymm1, %ymm30 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 16(%r9), %xmm0 -; AVX512DQ-NEXT: vmovdqa 16(%r8), %xmm15 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm25 -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm27 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm2 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm10, %ymm3 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm5 = mem[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm10 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm13 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm13 & (zmm10 ^ zmm3)) -; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm4 -; 
AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm3 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm5 = mem[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] -; AVX512DQ-NEXT: vpandnq %zmm4, %zmm3, %zmm4 -; AVX512DQ-NEXT: vpandq %zmm3, %zmm5, %zmm5 +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm31 +; AVX512DQ-NEXT: vmovdqa 16(%r9), %xmm6 +; AVX512DQ-NEXT: vmovdqa 16(%r8), %xmm5 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm11, %ymm1, %ymm27 +; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm1 +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm11 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3],xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm21 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm22 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero,xmm11[2],zero,zero,zero,xmm11[3],zero,zero,zero,xmm11[4],zero,zero,zero,xmm11[5],zero,zero,zero,xmm11[6],zero,zero,zero,xmm11[7],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm11 = mem[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm11 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm0 ^ (zmm11 & (zmm22 ^ zmm0)) +; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm7 = mem[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm7 = mem[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm10 = mem[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, 
%zmm7, %zmm10 +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm7 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0] +; AVX512DQ-NEXT: vpandnq %zmm0, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpandq %zmm7, %zmm10, %zmm10 ; AVX512DQ-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vpord %zmm4, %zmm5, %zmm10 {%k1} -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm13 & (zmm9 ^ zmm0)) +; AVX512DQ-NEXT: vpord %zmm0, %zmm10, %zmm22 {%k1} +; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm10 = mem[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm24 = zmm0 ^ (zmm11 & (zmm24 ^ zmm0)) ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm2 = mem[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm2 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd $232, (%rsp), %ymm4 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm4 = mem[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpandnq %zmm0, %zmm3, %zmm0 -; AVX512DQ-NEXT: vpandq %zmm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpord %zmm0, %zmm2, %zmm9 {%k1} -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm11 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm13 & (zmm11 ^ zmm0)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded 
Reload +; AVX512DQ-NEXT: # ymm10 = mem[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm10 = mem[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm26 = ymm26[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm26, %zmm10, %zmm10 +; AVX512DQ-NEXT: vpandnq %zmm0, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpandq %zmm7, %zmm10, %zmm10 +; AVX512DQ-NEXT: vpord %zmm0, %zmm10, %zmm24 {%k1} +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm11 & (zmm10 ^ zmm0)) +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpandnq %zmm0, %zmm3, %zmm0 -; AVX512DQ-NEXT: vpandq %zmm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpord %zmm0, %zmm1, %zmm11 {%k1} -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpandnq %zmm0, %zmm7, %zmm0 +; AVX512DQ-NEXT: vpandq %zmm7, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpord %zmm0, %zmm1, %zmm10 {%k1} +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm17 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm28 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm6 = 
xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm6, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm13 & (zmm4 ^ zmm0)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm24[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm22[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm18[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-NEXT: vpandnq %zmm0, %zmm3, %zmm0 -; AVX512DQ-NEXT: vpandq %zmm3, %zmm6, %zmm6 -; AVX512DQ-NEXT: vpord %zmm0, %zmm6, %zmm4 {%k1} -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm19 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm14 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm11 & (zmm4 ^ zmm1)) +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpandnq %zmm1, %zmm7, %zmm1 +; AVX512DQ-NEXT: vpandq %zmm7, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpord %zmm1, %zmm3, %zmm4 {%k1} +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm18 -; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm2 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm6, %ymm2, %ymm21 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm2, %ymm22 -; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm0 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; 
AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm2, %ymm26 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm6, %ymm7, %ymm19 -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm14 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm15 -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm7 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm12, %ymm8, %ymm17 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,4,6,5] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm8, %ymm20 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,1,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm8 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm1, %ymm20 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm8, %ymm3, %ymm23 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm9 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,5,7] +; AVX512DQ-NEXT: 
vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm5 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm6, %ymm13 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm15 +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm5 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm6, %ymm16 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,4,6,5] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm1, %ymm18 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15] +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512DQ-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,1,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm7, %ymm1 -; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm12[8],xmm7[8],xmm12[9],xmm7[9],xmm12[10],xmm7[10],xmm12[11],xmm7[11],xmm12[12],xmm7[12],xmm12[13],xmm7[13],xmm12[14],xmm7[14],xmm12[15],xmm7[15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[2,3,2,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm23 = xmm5[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm23, %ymm12, %ymm12 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm23, %ymm5 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm12 -; AVX512DQ-NEXT: vpshufd $212, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm23 = mem[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm13 & (zmm5 ^ zmm12)) -; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm12 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm23 = mem[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm12 -; AVX512DQ-NEXT: vpandnq %zmm12, %zmm3, %zmm12 -; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm23 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm24 = mem[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm24, %zmm23, %zmm23 -; AVX512DQ-NEXT: vpandq %zmm3, %zmm23, %zmm23 -; AVX512DQ-NEXT: vpord %zmm12, %zmm23, %zmm5 {%k1} -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm23 = xmm0[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm2 -; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm6 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm23, %ymm12, %ymm12 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm24, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm23 = xmm7[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm24 = xmm7[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm30 = mem[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm30, %zmm12, %zmm12 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm13 & (zmm12 ^ zmm0)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm30 = xmm2[3,3,3,3] -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm31 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm6, %ymm1 +; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm6 +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm14 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = 
xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm14 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm14 ^ (zmm11 & (zmm0 ^ zmm14)) +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero,xmm8[4],zero,zero,zero,xmm8[5],zero,zero,zero,xmm8[6],zero,zero,zero,xmm8[7],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero,xmm12[4],zero,zero,zero,xmm12[5],zero,zero,zero,xmm12[6],zero,zero,zero,xmm12[7],zero,zero,zero +; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm14 = mem[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm25 = mem[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm25 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm27 = mem[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 -; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm27 = mem[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm16 = mem[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 -; AVX512DQ-NEXT: vpandnq %zmm25, %zmm3, %zmm25 -; AVX512DQ-NEXT: vpandq %zmm3, %zmm16, %zmm16 -; AVX512DQ-NEXT: vpord %zmm25, %zmm16, %zmm12 {%k1} -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm23, %ymm16, %ymm16 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm7, %ymm29, %ymm7 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm16 = ymm28[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ 
(zmm13 & (zmm16 ^ zmm7)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm18 = ymm22[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm18 = ymm26[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6 -; AVX512DQ-NEXT: vpandnq %zmm7, %zmm3, %zmm7 -; AVX512DQ-NEXT: vpandq %zmm3, %zmm6, %zmm6 -; AVX512DQ-NEXT: vpord %zmm7, %zmm6, %zmm16 {%k1} -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm30[0],zero,zero,zero,xmm30[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,1,3,3,6,5,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm13 & (zmm2 ^ zmm0)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm26 = mem[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm25 +; AVX512DQ-NEXT: vpandnq %zmm14, %zmm7, %zmm14 +; AVX512DQ-NEXT: vpandq %zmm7, %zmm25, %zmm25 +; AVX512DQ-NEXT: vpord %zmm14, %zmm25, %zmm0 {%k1} +; AVX512DQ-NEXT: vpshufd $212, (%rsp), %ymm14 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm14 ^ (zmm11 & (zmm8 ^ zmm14)) +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm21 = ymm27[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm21, %zmm17 +; AVX512DQ-NEXT: vpandnq %zmm14, %zmm7, %zmm14 +; AVX512DQ-NEXT: vpandq %zmm7, %zmm17, %zmm17 +; AVX512DQ-NEXT: vpord %zmm14, %zmm17, %zmm8 {%k1} +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm17 = ymm20[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm14 ^ (zmm11 & (zmm6 ^ zmm14)) +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpandnq %zmm9, %zmm7, %zmm3 +; AVX512DQ-NEXT: vpandq %zmm7, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpord %zmm3, %zmm2, %zmm6 {%k1} +; AVX512DQ-NEXT: 
vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,3,3,6,5,7,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm2 ^ (zmm11 & (zmm12 ^ zmm2)) +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 -; AVX512DQ-NEXT: vpandnq %zmm0, %zmm3, %zmm0 -; AVX512DQ-NEXT: vpandq %zmm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpord %zmm0, %zmm1, %zmm2 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512DQ-NEXT: vpandnq %zmm2, %zmm7, %zmm2 +; AVX512DQ-NEXT: vpandq %zmm7, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpord %zmm2, %zmm1, %zmm12 {%k1} ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm12, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 64(%rax) -; AVX512DQ-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 448(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%rax) +; AVX512DQ-NEXT: addq $552, %rsp # imm = 0x228 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -8546,332 +8090,260 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa 16(%r10), %xmm12 ; AVX512BW-NEXT: vmovdqa64 32(%r10), %xmm16 ; AVX512BW-NEXT: vmovdqa 48(%r10), %xmm15 -; AVX512BW-NEXT: vmovdqa (%rax), %xmm2 +; AVX512BW-NEXT: vmovdqa (%rax), %xmm1 ; AVX512BW-NEXT: vmovdqa 16(%rax), %xmm13 ; AVX512BW-NEXT: vmovdqa64 32(%rax), %xmm17 ; AVX512BW-NEXT: vmovdqa64 48(%rax), %xmm18 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3] -; AVX512BW-NEXT: vpermw %ymm1, %ymm3, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa (%r9), %xmm4 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3] +; AVX512BW-NEXT: vpermw %ymm3, %ymm2, %ymm3 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm6 +; AVX512BW-NEXT: vmovdqa 
(%r9), %xmm3 ; AVX512BW-NEXT: vmovdqa64 48(%r9), %xmm19 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm5 -; AVX512BW-NEXT: vmovdqa64 48(%r8), %xmm21 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7] +; AVX512BW-NEXT: vmovdqa (%r8), %xmm4 +; AVX512BW-NEXT: vmovdqa64 48(%r8), %xmm20 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,5,5,7] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm6 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3] -; AVX512BW-NEXT: vpermw %ymm7, %ymm6, %ymm7 +; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3] +; AVX512BW-NEXT: vpermw %ymm7, %ymm5, %ymm7 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm14 ; AVX512BW-NEXT: movl $-2004318072, %eax # imm = 0x88888888 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512BW-NEXT: vmovdqa64 48(%rsi), %xmm24 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512BW-NEXT: vmovdqa64 48(%rdi), %xmm27 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[3,3,3,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512BW-NEXT: vmovdqa64 48(%rcx), %xmm21 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512BW-NEXT: vmovdqa64 48(%rdx), %xmm23 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,4,6,6,7] ; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512BW-NEXT: vmovdqa64 48(%rcx), %xmm28 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512BW-NEXT: vmovdqa64 48(%rdx), %xmm29 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm20[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,6,6,7] -; 
AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm11, %ymm11 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm11[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7] -; AVX512BW-NEXT: vpermw %ymm20, %ymm11, %ymm20 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm20 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,1,3,3,6,5,7,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7] +; AVX512BW-NEXT: vpermw %ymm7, %ymm9, %ymm7 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm22 +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512BW-NEXT: vmovdqa64 48(%rsi), %xmm24 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero ; AVX512BW-NEXT: movl $572662306, %eax # imm = 0x22222222 ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm1 {%k2} +; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm7 {%k2} ; AVX512BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm7 {%k3} ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm14[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm14, %ymm3, %ymm14 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm14, %zmm14 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,5,5,7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm22, %ymm22 +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm14[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm22, %ymm22 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm22[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm20, %ymm6, %ymm20 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm23 -; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm23 {%k1} +; AVX512BW-NEXT: vpermw %ymm14, %ymm2, %ymm14 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm22, %zmm14, %zmm14 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm22[0,1,2,3,4,5,5,7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm22[0,1,2,3,6,5,7,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpermw %ymm22, %ymm5, %ymm22 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm22, %zmm25 +; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm25 {%k1} +; AVX512BW-NEXT: vpunpcklbw 
{{.*#+}} xmm14 = xmm23[0],xmm21[0],xmm23[1],xmm21[1],xmm23[2],xmm21[2],xmm23[3],xmm21[3],xmm23[4],xmm21[4],xmm23[5],xmm21[5],xmm23[6],xmm21[6],xmm23[7],xmm21[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm14[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm26, %ymm22, %ymm22 +; AVX512BW-NEXT: vmovdqa64 48(%rdi), %xmm27 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm22[2,1,3,3,6,5,7,7] +; AVX512BW-NEXT: vpermw %ymm14, %ymm9, %ymm14 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm22, %zmm14, %zmm22 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm27[0],xmm24[0],xmm27[1],xmm24[1],xmm27[2],xmm24[2],xmm27[3],xmm24[3],xmm27[4],xmm24[4],xmm27[5],xmm24[5],xmm27[6],xmm24[6],xmm27[7],xmm24[7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm14[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm22 = xmm14[3,3,3,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm14, %ymm22, %ymm14 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm14, %zmm14 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm22, %ymm25 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero,xmm14[4],zero,zero,zero,xmm14[5],zero,zero,zero,xmm14[6],zero,zero,zero,xmm14[7],zero,zero,zero +; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqa64 32(%r9), %xmm22 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vpermw %ymm20, %ymm11, %ymm20 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm20 -; AVX512BW-NEXT: vmovdqa64 32(%r8), %xmm25 -; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqa64 32(%rsi), %xmm20 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm14 {%k3} +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm14 {%k3} ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm18 = xmm15[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm15[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm18, %ymm18 -; AVX512BW-NEXT: vmovdqa64 32(%rdi), %xmm23 +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm15[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm18, %ymm18 +; AVX512BW-NEXT: vmovdqa64 32(%r8), %xmm25 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm15, %ymm3, %ymm15 +; AVX512BW-NEXT: vpermw %ymm15, %ymm2, %ymm15 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm18, %zmm15, %zmm15 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = 
xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,5,5,7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm21 -; AVX512BW-NEXT: vmovdqa64 32(%rcx), %xmm19 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm18, %ymm6, %ymm18 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm18[0,1,2,3,6,5,7,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm19, %ymm19 +; AVX512BW-NEXT: vmovdqa64 32(%rcx), %xmm20 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpermw %ymm18, %ymm5, %ymm18 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm18 ; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm26 ; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm18 {%k1} +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm23[8],xmm21[8],xmm23[9],xmm21[9],xmm23[10],xmm21[10],xmm23[11],xmm21[11],xmm23[12],xmm21[12],xmm23[13],xmm21[13],xmm23[14],xmm21[14],xmm23[15],xmm21[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm15[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm15[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm19 +; AVX512BW-NEXT: vmovdqa64 32(%rsi), %xmm21 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[2,1,3,3,6,5,7,7] +; AVX512BW-NEXT: vpermw %ymm15, %ymm9, %ymm15 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm19 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm27[8],xmm24[8],xmm27[9],xmm24[9],xmm27[10],xmm24[10],xmm27[11],xmm24[11],xmm27[12],xmm24[12],xmm27[13],xmm24[13],xmm27[14],xmm24[14],xmm27[15],xmm24[15] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm15[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm24 = xmm15[3,3,3,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm15, %ymm24, %ymm15 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm15, %zmm15 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm24, %ymm24 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vpermw %ymm21, %ymm11, %ymm21 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm24, %zmm21, %zmm21 -; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm15 {%k2} +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm15 = 
xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero,xmm15[2],zero,zero,zero,xmm15[3],zero,zero,zero,xmm15[4],zero,zero,zero,xmm15[5],zero,zero,zero,xmm15[6],zero,zero,zero,xmm15[7],zero,zero,zero +; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm15 {%k2} ; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm15 {%k3} ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm18[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm19, %ymm19 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpermw %ymm18, %ymm2, %ymm18 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm18 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,5,5,7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm19[0,1,2,3,6,5,7,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm23, %ymm23 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpermw %ymm19, %ymm5, %ymm19 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm23, %zmm19, %zmm23 +; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm23 {%k1} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm26[0],xmm20[0],xmm26[1],xmm20[1],xmm26[2],xmm20[2],xmm26[3],xmm20[3],xmm26[4],xmm20[4],xmm26[5],xmm20[5],xmm26[6],xmm20[6],xmm26[7],xmm20[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,4,6,5] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm18[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm18, %ymm3, %ymm18 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,5,5,7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm24, %ymm24 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm21, %ymm6, %ymm21 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm24, %zmm21, %zmm27 -; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm27 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm23[0],xmm20[0],xmm23[1],xmm20[1],xmm23[2],xmm20[2],xmm23[3],xmm20[3],xmm23[4],xmm20[4],xmm23[5],xmm20[5],xmm23[6],xmm20[6],xmm23[7],xmm20[7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm18[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm24 = xmm18[3,3,3,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm18 = xmm18[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm18, %ymm24, %ymm18 -; AVX512BW-NEXT: vinserti64x4 $1, 
%ymm21, %zmm18, %zmm18 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm26[0],xmm19[0],xmm26[1],xmm19[1],xmm26[2],xmm19[2],xmm26[3],xmm19[3],xmm26[4],xmm19[4],xmm26[5],xmm19[5],xmm26[6],xmm19[6],xmm26[7],xmm19[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm28 = xmm21[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm28, %ymm24, %ymm28 -; AVX512BW-NEXT: vmovdqa64 16(%r9), %xmm24 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm28 = ymm28[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vpermw %ymm21, %ymm11, %ymm21 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm28, %zmm21, %zmm21 -; AVX512BW-NEXT: vmovdqa64 16(%r8), %xmm28 -; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 16(%rsi), %xmm21 -; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm18 {%k3} +; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm19, %ymm19 +; AVX512BW-NEXT: vmovdqa64 32(%rdi), %xmm27 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[2,1,3,3,6,5,7,7] +; AVX512BW-NEXT: vpermw %ymm18, %ymm9, %ymm18 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm19 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3],xmm27[4],xmm21[4],xmm27[5],xmm21[5],xmm27[6],xmm21[6],xmm27[7],xmm21[7] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero,xmm18[2],zero,zero,zero,xmm18[3],zero,zero,zero,xmm18[4],zero,zero,zero,xmm18[5],zero,zero,zero,xmm18[6],zero,zero,zero,xmm18[7],zero,zero,zero +; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm18 {%k2} +; AVX512BW-NEXT: vmovdqa64 16(%r9), %xmm19 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k3} ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm16[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm16[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm17, %ymm17 -; AVX512BW-NEXT: vmovdqa64 16(%rdi), %xmm27 +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm16[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm17, %ymm17 +; AVX512BW-NEXT: vmovdqa64 16(%r8), %xmm23 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm16, %ymm3, %ymm16 +; AVX512BW-NEXT: vpermw %ymm16, %ymm2, %ymm16 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm22[0,1,2,3,4,5,5,7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm22[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm17, %ymm25 +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm22[0,1,2,3,6,5,7,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm17, %ymm24 ; AVX512BW-NEXT: vmovdqa64 16(%rcx), %xmm17 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm22, %ymm6, %ymm22 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm22, %zmm25 -; AVX512BW-NEXT: vmovdqa64 16(%rdx), %xmm22 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpermw %ymm22, %ymm5, %ymm22 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm24, %zmm22, %zmm25 +; AVX512BW-NEXT: vmovdqa64 16(%rdx), %xmm24 ; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm25 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} 
xmm16 = xmm23[8],xmm20[8],xmm23[9],xmm20[9],xmm23[10],xmm20[10],xmm23[11],xmm20[11],xmm23[12],xmm20[12],xmm23[13],xmm20[13],xmm23[14],xmm20[14],xmm23[15],xmm20[15] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm16[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm23 = xmm16[3,3,3,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm16 = xmm16[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm23, %ymm16 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm16, %zmm16 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm19[8],xmm26[9],xmm19[9],xmm26[10],xmm19[10],xmm26[11],xmm19[11],xmm26[12],xmm19[12],xmm26[13],xmm19[13],xmm26[14],xmm19[14],xmm26[15],xmm19[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm26[8],xmm20[8],xmm26[9],xmm20[9],xmm26[10],xmm20[10],xmm26[11],xmm20[11],xmm26[12],xmm20[12],xmm26[13],xmm20[13],xmm26[14],xmm20[14],xmm26[15],xmm20[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm16[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm16[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20 +; AVX512BW-NEXT: vmovdqa64 16(%rsi), %xmm22 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vpermw %ymm19, %ymm11, %ymm19 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19 -; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm16 {%k2} +; AVX512BW-NEXT: vpermw %ymm16, %ymm9, %ymm16 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm16, %zmm20 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm27[8],xmm21[8],xmm27[9],xmm21[9],xmm27[10],xmm21[10],xmm27[11],xmm21[11],xmm27[12],xmm21[12],xmm27[13],xmm21[13],xmm27[14],xmm21[14],xmm27[15],xmm21[15] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero,xmm16[4],zero,zero,zero,xmm16[5],zero,zero,zero,xmm16[6],zero,zero,zero,xmm16[7],zero,zero,zero +; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k3} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm19, %ymm3, %ymm19 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm28[0],xmm24[0],xmm28[1],xmm24[1],xmm28[2],xmm24[2],xmm28[3],xmm24[3],xmm28[4],xmm24[4],xmm28[5],xmm24[5],xmm28[6],xmm24[6],xmm28[7],xmm24[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,4,5,5,7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm23, %ymm23 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm23 
= ymm23[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm20, %ymm6, %ymm20 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm20 -; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm20 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3],xmm27[4],xmm21[4],xmm27[5],xmm21[5],xmm27[6],xmm21[6],xmm27[7],xmm21[7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm23 = xmm19[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm25 = xmm19[3,3,3,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm23, %ymm23 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm19 = xmm19[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm19, %ymm25, %ymm19 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm23, %zmm19, %zmm19 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm17[0],xmm22[1],xmm17[1],xmm22[2],xmm17[2],xmm22[3],xmm17[3],xmm22[4],xmm17[4],xmm22[5],xmm17[5],xmm22[6],xmm17[6],xmm22[7],xmm17[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm23[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm23[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm20[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm21, %ymm21 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpermw %ymm20, %ymm2, %ymm20 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm20, %zmm20 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm23[0],xmm19[0],xmm23[1],xmm19[1],xmm23[2],xmm19[2],xmm23[3],xmm19[3],xmm23[4],xmm19[4],xmm23[5],xmm19[5],xmm23[6],xmm19[6],xmm23[7],xmm19[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm21[0,1,2,3,4,5,5,7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm21[0,1,2,3,6,5,7,7] ; AVX512BW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpermw %ymm21, %ymm5, %ymm21 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm21, %zmm21 +; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm21 {%k1} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm24[0],xmm17[0],xmm24[1],xmm17[1],xmm24[2],xmm17[2],xmm24[3],xmm17[3],xmm24[4],xmm17[4],xmm24[5],xmm17[5],xmm24[6],xmm17[6],xmm24[7],xmm17[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm20[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 +; AVX512BW-NEXT: vmovdqa64 16(%rdi), %xmm26 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vpermw %ymm23, %ymm11, %ymm23 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm23, %zmm23 -; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k3} +; AVX512BW-NEXT: vpermw %ymm20, %ymm9, %ymm20 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm25 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = 
xmm26[0],xmm22[0],xmm26[1],xmm22[1],xmm26[2],xmm22[2],xmm26[3],xmm22[3],xmm26[4],xmm22[4],xmm26[5],xmm22[5],xmm26[6],xmm22[6],xmm26[7],xmm22[7] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero,xmm20[2],zero,zero,zero,xmm20[3],zero,zero,zero,xmm20[4],zero,zero,zero,xmm20[5],zero,zero,zero,xmm20[6],zero,zero,zero,xmm20[7],zero,zero,zero +; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k3} ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm12[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm13, %ymm13 +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm12[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm13, %ymm13 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm12, %ymm3, %ymm12 +; AVX512BW-NEXT: vpermw %ymm12, %ymm2, %ymm12 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm28[8],xmm24[8],xmm28[9],xmm24[9],xmm28[10],xmm24[10],xmm28[11],xmm24[11],xmm28[12],xmm24[12],xmm28[13],xmm24[13],xmm28[14],xmm24[14],xmm28[15],xmm24[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm13[0,1,2,3,4,5,5,7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm13[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vpermw %ymm13, %ymm6, %ymm13 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm13, %zmm13 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm23[8],xmm19[8],xmm23[9],xmm19[9],xmm23[10],xmm19[10],xmm23[11],xmm19[11],xmm23[12],xmm19[12],xmm23[13],xmm19[13],xmm23[14],xmm19[14],xmm23[15],xmm19[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm13[0,1,2,3,4,5,5,7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm13[0,1,2,3,6,5,7,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm19 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vpermw %ymm13, %ymm5, %ymm13 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm13, %zmm13 ; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm27[8],xmm21[8],xmm27[9],xmm21[9],xmm27[10],xmm21[10],xmm27[11],xmm21[11],xmm27[12],xmm21[12],xmm27[13],xmm21[13],xmm27[14],xmm21[14],xmm27[15],xmm21[15] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm12[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm12[3,3,3,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512BW-NEXT: vinserti32x4 $1, %xmm12, %ymm21, %ymm12 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm12, %zmm12 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm22[8],xmm17[8],xmm22[9],xmm17[9],xmm22[10],xmm17[10],xmm22[11],xmm17[11],xmm22[12],xmm17[12],xmm22[13],xmm17[13],xmm22[14],xmm17[14],xmm22[15],xmm17[15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} 
xmm20 = xmm17[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm17[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vpermw %ymm17, %ymm11, %ymm17 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm17 -; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k3} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm2 -; AVX512BW-NEXT: vpermw %ymm0, %ymm3, %ymm0 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm24[8],xmm17[8],xmm24[9],xmm17[9],xmm24[10],xmm17[10],xmm24[11],xmm17[11],xmm24[12],xmm17[12],xmm24[13],xmm17[13],xmm24[14],xmm17[14],xmm24[15],xmm17[15] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm12[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm12[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti32x4 $1, %xmm19, %ymm17, %ymm17 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[2,1,3,3,6,5,7,7] +; AVX512BW-NEXT: vpermw %ymm12, %ymm9, %ymm12 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm12, %zmm12 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm26[8],xmm22[8],xmm26[9],xmm22[9],xmm26[10],xmm22[10],xmm26[11],xmm22[11],xmm26[12],xmm22[12],xmm26[13],xmm22[13],xmm26[14],xmm22[14],xmm26[15],xmm22[15] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero,xmm17[4],zero,zero,zero,xmm17[5],zero,zero,zero,xmm17[6],zero,zero,zero,xmm17[7],zero,zero,zero +; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm17 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm17 {%k3} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm1, %ymm1 +; AVX512BW-NEXT: vpermw %ymm0, %ymm2, %ymm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,5,7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,6,5,7,7] +; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpermw %ymm1, %ymm5, %ymm1 ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] +; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpermw %ymm0, %ymm9, %ymm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; 
AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7] -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512BW-NEXT: vpermw %ymm2, %ymm6, %ymm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] -; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512BW-NEXT: vpermw %ymm3, %ymm11, %ymm3 -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k3} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm16, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm18, 256(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm15, 448(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -9056,332 +8528,260 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa 16(%r10), %xmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 32(%r10), %xmm16 ; AVX512DQ-BW-NEXT: vmovdqa 48(%r10), %xmm15 -; AVX512DQ-BW-NEXT: vmovdqa (%rax), %xmm2 +; AVX512DQ-BW-NEXT: vmovdqa (%rax), %xmm1 ; AVX512DQ-BW-NEXT: vmovdqa 
16(%rax), %xmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 32(%rax), %xmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 48(%rax), %xmm18 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3] -; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm3, %ymm1 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3] +; AVX512DQ-BW-NEXT: vpermw %ymm3, %ymm2, %ymm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 48(%r9), %xmm19 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%r8), %xmm21 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,5,5,7] +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%r8), %xmm20 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm7[0,1,2,3,4,5,5,7] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm6 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3] -; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm6, %ymm7 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3] +; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm5, %ymm7 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm14 ; AVX512DQ-BW-NEXT: movl $-2004318072, %eax # imm = 0x88888888 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%rsi), %xmm24 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdi), %xmm27 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm10 = 
xmm1[3,3,3,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%rcx), %xmm21 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdx), %xmm23 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,4,6,6,7] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%rcx), %xmm28 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdx), %xmm29 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm20[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm11, %ymm11 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm11[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm11, %ymm20 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm20 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,1,3,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm9, %ymm7 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%rsi), %xmm24 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero ; AVX512DQ-BW-NEXT: movl $572662306, %eax # imm = 0x22222222 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm1 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm22, %zmm7 {%k2} ; AVX512DQ-BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-BW-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm14, %zmm7 {%k3} ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm14[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20 -; 
AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm14, %ymm3, %ymm14 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm14, %zmm14 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm20[0,1,2,3,4,5,5,7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm22, %ymm22 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm14[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm22, %ymm22 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm22[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm6, %ymm20 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm14, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vpermw %ymm14, %ymm2, %ymm14 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm22, %zmm14, %zmm14 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm22[0,1,2,3,4,5,5,7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm22[0,1,2,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm22, %ymm5, %ymm22 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm22, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm14, %zmm25 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm23[0],xmm21[0],xmm23[1],xmm21[1],xmm23[2],xmm21[2],xmm23[3],xmm21[3],xmm23[4],xmm21[4],xmm23[5],xmm21[5],xmm23[6],xmm21[6],xmm23[7],xmm21[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm14[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm14[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm26, %ymm22, %ymm22 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdi), %xmm27 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm22 = ymm22[2,1,3,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vpermw %ymm14, %ymm9, %ymm14 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm22, %zmm14, %zmm22 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm27[0],xmm24[0],xmm27[1],xmm24[1],xmm27[2],xmm24[2],xmm27[3],xmm24[3],xmm27[4],xmm24[4],xmm27[5],xmm24[5],xmm27[6],xmm24[6],xmm27[7],xmm24[7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm14[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm22 = xmm14[3,3,3,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm22 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm14, %ymm22, %ymm14 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm14, %zmm14 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = 
xmm20[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm22, %ymm25 +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero,xmm14[4],zero,zero,zero,xmm14[5],zero,zero,zero,xmm14[6],zero,zero,zero,xmm14[7],zero,zero,zero +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm22, %zmm14 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %xmm22 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm11, %ymm20 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %xmm25 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %xmm20 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm14 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm14 {%k3} ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm18 = xmm15[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm15[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm18, %ymm18 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %xmm23 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm15[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm18, %ymm18 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %xmm25 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm18 = ymm18[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm15, %ymm3, %ymm15 +; AVX512DQ-BW-NEXT: vpermw %ymm15, %ymm2, %ymm15 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm18, %zmm15, %zmm15 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,5,5,7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm21 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %xmm19 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm18, %ymm6, %ymm18 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm18[0,1,2,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm20, %ymm19, %ymm19 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %xmm20 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm18, %ymm5, %ymm18 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm26 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm15, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm23[8],xmm21[8],xmm23[9],xmm21[9],xmm23[10],xmm21[10],xmm23[11],xmm21[11],xmm23[12],xmm21[12],xmm23[13],xmm21[13],xmm23[14],xmm21[14],xmm23[15],xmm21[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm15[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm15[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm19 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %xmm21 +; AVX512DQ-BW-NEXT: 
vpshufd {{.*#+}} ymm19 = ymm19[2,1,3,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vpermw %ymm15, %ymm9, %ymm15 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm19 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm27[8],xmm24[8],xmm27[9],xmm24[9],xmm27[10],xmm24[10],xmm27[11],xmm24[11],xmm27[12],xmm24[12],xmm27[13],xmm24[13],xmm27[14],xmm24[14],xmm27[15],xmm24[15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm15[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm24 = xmm15[3,3,3,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm15, %ymm24, %ymm15 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm15, %zmm15 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm24, %ymm24 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vpermw %ymm21, %ymm11, %ymm21 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm24, %zmm21, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm15 {%k2} +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero,xmm15[2],zero,zero,zero,xmm15[3],zero,zero,zero,xmm15[4],zero,zero,zero,xmm15[5],zero,zero,zero,xmm15[6],zero,zero,zero,xmm15[7],zero,zero,zero +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm15 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm15 {%k3} ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm18[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm18[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm19, %ymm19 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm18, %ymm2, %ymm18 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm18 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,5,5,7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm19[0,1,2,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm23, %ymm23 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm19, %ymm5, %ymm19 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm23, %zmm19, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm18, %zmm23 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = 
xmm26[0],xmm20[0],xmm26[1],xmm20[1],xmm26[2],xmm20[2],xmm26[3],xmm20[3],xmm26[4],xmm20[4],xmm26[5],xmm20[5],xmm26[6],xmm20[6],xmm26[7],xmm20[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm18[0,1,2,3,4,4,6,5] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm18[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm18, %ymm3, %ymm18 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm25[0],xmm22[0],xmm25[1],xmm22[1],xmm25[2],xmm22[2],xmm25[3],xmm22[3],xmm25[4],xmm22[4],xmm25[5],xmm22[5],xmm25[6],xmm22[6],xmm25[7],xmm22[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,5,5,7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm21[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm24, %ymm24 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm21, %ymm6, %ymm21 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm24, %zmm21, %zmm27 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm18, %zmm27 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm23[0],xmm20[0],xmm23[1],xmm20[1],xmm23[2],xmm20[2],xmm23[3],xmm20[3],xmm23[4],xmm20[4],xmm23[5],xmm20[5],xmm23[6],xmm20[6],xmm23[7],xmm20[7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm18[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm24 = xmm18[3,3,3,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm24[0],zero,zero,zero,xmm24[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm21, %ymm21 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm18 = xmm18[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm18, %ymm24, %ymm18 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm26[0],xmm19[0],xmm26[1],xmm19[1],xmm26[2],xmm19[2],xmm26[3],xmm19[3],xmm26[4],xmm19[4],xmm26[5],xmm19[5],xmm26[6],xmm19[6],xmm26[7],xmm19[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm21[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm28 = xmm21[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm28, %ymm24, %ymm28 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%r9), %xmm24 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm28 = ymm28[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vpermw %ymm21, %ymm11, %ymm21 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm28, %zmm21, %zmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%r8), %xmm28 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rsi), %xmm21 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm27, %zmm18 {%k3} +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm19, %ymm19 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %xmm27 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[2,1,3,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vpermw %ymm18, %ymm9, %ymm18 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm19 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3],xmm27[4],xmm21[4],xmm27[5],xmm21[5],xmm27[6],xmm21[6],xmm27[7],xmm21[7] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm18 = 
xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero,xmm18[2],zero,zero,zero,xmm18[3],zero,zero,zero,xmm18[4],zero,zero,zero,xmm18[5],zero,zero,zero,xmm18[6],zero,zero,zero,xmm18[7],zero,zero,zero +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm18 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 16(%r9), %xmm19 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k3} ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm16[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm27 = xmm16[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm17, %ymm17 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdi), %xmm27 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm16[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm17, %ymm17 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%r8), %xmm23 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm16, %ymm3, %ymm16 +; AVX512DQ-BW-NEXT: vpermw %ymm16, %ymm2, %ymm16 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm25[8],xmm22[8],xmm25[9],xmm22[9],xmm25[10],xmm22[10],xmm25[11],xmm22[11],xmm25[12],xmm22[12],xmm25[13],xmm22[13],xmm25[14],xmm22[14],xmm25[15],xmm22[15] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm22[0,1,2,3,4,5,5,7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm22[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm17, %ymm25 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm24 = xmm22[0,1,2,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm17, %ymm24 ; AVX512DQ-BW-NEXT: vmovdqa64 16(%rcx), %xmm17 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm22, %ymm6, %ymm22 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm22, %zmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdx), %xmm22 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm24 = ymm24[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm22, %ymm5, %ymm22 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm24, %zmm22, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdx), %xmm24 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm25 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm23[8],xmm20[8],xmm23[9],xmm20[9],xmm23[10],xmm20[10],xmm23[11],xmm20[11],xmm23[12],xmm20[12],xmm23[13],xmm20[13],xmm23[14],xmm20[14],xmm23[15],xmm20[15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm16[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm23 = xmm16[3,3,3,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm16 = xmm16[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm16, %ymm23, %ymm16 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm16, %zmm16 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm19[8],xmm26[9],xmm19[9],xmm26[10],xmm19[10],xmm26[11],xmm19[11],xmm26[12],xmm19[12],xmm26[13],xmm19[13],xmm26[14],xmm19[14],xmm26[15],xmm19[15] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw 
{{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm26[8],xmm20[8],xmm26[9],xmm20[9],xmm26[10],xmm20[10],xmm26[11],xmm20[11],xmm26[12],xmm20[12],xmm26[13],xmm20[13],xmm26[14],xmm20[14],xmm26[15],xmm20[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm16[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm22 = xmm16[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm20, %ymm20 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rsi), %xmm22 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vpermw %ymm19, %ymm11, %ymm19 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm16 {%k2} +; AVX512DQ-BW-NEXT: vpermw %ymm16, %ymm9, %ymm16 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm16, %zmm20 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm27[8],xmm21[8],xmm27[9],xmm21[9],xmm27[10],xmm21[10],xmm27[11],xmm21[11],xmm27[12],xmm21[12],xmm27[13],xmm21[13],xmm27[14],xmm21[14],xmm27[15],xmm21[15] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero,xmm16[4],zero,zero,zero,xmm16[5],zero,zero,zero,xmm16[6],zero,zero,zero,xmm16[7],zero,zero,zero +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm16 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k3} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm19[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm19[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm19, %ymm3, %ymm19 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm28[0],xmm24[0],xmm28[1],xmm24[1],xmm28[2],xmm24[2],xmm28[3],xmm24[3],xmm28[4],xmm24[4],xmm28[5],xmm24[5],xmm28[6],xmm24[6],xmm28[7],xmm24[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm20[0,1,2,3,4,5,5,7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm23, %ymm23 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm6, %ymm20 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm23, %zmm20, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3],xmm27[4],xmm21[4],xmm27[5],xmm21[5],xmm27[6],xmm21[6],xmm27[7],xmm21[7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm23 = xmm19[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm25 = xmm19[3,3,3,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm25[0],zero,zero,zero,xmm25[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm23, %ymm23 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm19 = xmm19[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm19, %ymm25, %ymm19 -; AVX512DQ-BW-NEXT: 
vinserti64x4 $1, %ymm23, %zmm19, %zmm19 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm17[0],xmm22[1],xmm17[1],xmm22[2],xmm17[2],xmm22[3],xmm17[3],xmm22[4],xmm17[4],xmm22[5],xmm17[5],xmm22[6],xmm17[6],xmm22[7],xmm17[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm23[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm23[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm20[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm25, %ymm21, %ymm21 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm21 = ymm21[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm2, %ymm20 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm21, %zmm20, %zmm20 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm23[0],xmm19[0],xmm23[1],xmm19[1],xmm23[2],xmm19[2],xmm23[3],xmm19[3],xmm23[4],xmm19[4],xmm23[5],xmm19[5],xmm23[6],xmm19[6],xmm23[7],xmm19[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm21[0,1,2,3,4,5,5,7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm21[0,1,2,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm21, %ymm5, %ymm21 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm21, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm21 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm24[0],xmm17[0],xmm24[1],xmm17[1],xmm24[2],xmm17[2],xmm24[3],xmm17[3],xmm24[4],xmm17[4],xmm24[5],xmm17[5],xmm24[6],xmm17[6],xmm24[7],xmm17[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm25 = xmm20[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm26 = xmm20[0,1,2,3,4,6,6,7] ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm26, %ymm25, %ymm25 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdi), %xmm26 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vpermw %ymm23, %ymm11, %ymm23 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm23, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm19 {%k3} +; AVX512DQ-BW-NEXT: vpermw %ymm20, %ymm9, %ymm20 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm25 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm26[0],xmm22[0],xmm26[1],xmm22[1],xmm26[2],xmm22[2],xmm26[3],xmm22[3],xmm26[4],xmm22[4],xmm26[5],xmm22[5],xmm26[6],xmm22[6],xmm26[7],xmm22[7] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero,xmm20[2],zero,zero,zero,xmm20[3],zero,zero,zero,xmm20[4],zero,zero,zero,xmm20[5],zero,zero,zero,xmm20[6],zero,zero,zero,xmm20[7],zero,zero,zero +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k3} ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm12[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm20, %ymm13, %ymm13 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm12[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm13, %ymm13 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] -; 
AVX512DQ-BW-NEXT: vpermw %ymm12, %ymm3, %ymm12 +; AVX512DQ-BW-NEXT: vpermw %ymm12, %ymm2, %ymm12 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm28[8],xmm24[8],xmm28[9],xmm24[9],xmm28[10],xmm24[10],xmm28[11],xmm24[11],xmm28[12],xmm24[12],xmm28[13],xmm24[13],xmm28[14],xmm24[14],xmm28[15],xmm24[15] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm13[0,1,2,3,4,5,5,7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm23 = xmm13[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm23, %ymm20, %ymm20 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vpermw %ymm13, %ymm6, %ymm13 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm13, %zmm13 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm23[8],xmm19[8],xmm23[9],xmm19[9],xmm23[10],xmm19[10],xmm23[11],xmm19[11],xmm23[12],xmm19[12],xmm23[13],xmm19[13],xmm23[14],xmm19[14],xmm23[15],xmm19[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm13[0,1,2,3,4,5,5,7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm13[0,1,2,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm19, %ymm19 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm19 = ymm19[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm13, %ymm5, %ymm13 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm13, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm13 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm27[8],xmm21[8],xmm27[9],xmm21[9],xmm27[10],xmm21[10],xmm27[11],xmm21[11],xmm27[12],xmm21[12],xmm27[13],xmm21[13],xmm27[14],xmm21[14],xmm27[15],xmm21[15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm20 = xmm12[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm12[3,3,3,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm21 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm12, %ymm21, %ymm12 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm12, %zmm12 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm22[8],xmm17[8],xmm22[9],xmm17[9],xmm22[10],xmm17[10],xmm22[11],xmm17[11],xmm22[12],xmm17[12],xmm22[13],xmm17[13],xmm22[14],xmm17[14],xmm22[15],xmm17[15] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm20 = xmm17[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm21 = xmm17[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm20, %ymm20 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vpermw %ymm17, %ymm11, %ymm17 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm12 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k3} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm3, %ymm0 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = 
xmm24[8],xmm17[8],xmm24[9],xmm17[9],xmm24[10],xmm17[10],xmm24[11],xmm17[11],xmm24[12],xmm17[12],xmm24[13],xmm17[13],xmm24[14],xmm17[14],xmm24[15],xmm17[15] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm17 = xmm12[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm19 = xmm12[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm19, %ymm17, %ymm17 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[2,1,3,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vpermw %ymm12, %ymm9, %ymm12 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm12, %zmm12 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm26[8],xmm22[8],xmm26[9],xmm22[9],xmm26[10],xmm22[10],xmm26[11],xmm22[11],xmm26[12],xmm22[12],xmm26[13],xmm22[13],xmm26[14],xmm22[14],xmm26[15],xmm22[15] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero,xmm17[4],zero,zero,zero,xmm17[5],zero,zero,zero,xmm17[6],zero,zero,zero,xmm17[7],zero,zero,zero +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm17 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm17 {%k3} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm2, %ymm0 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,5,7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,6,5,7,7] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm5, %ymm1 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm9, %ymm0 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512DQ-BW-NEXT: vpermw %ymm2, %ymm6, %ymm2 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-BW-NEXT: 
vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,6,6,7] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4 -; AVX512DQ-BW-NEXT: vpermw %ymm3, %ymm11, %ymm3 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm2 {%k3} ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 320(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 256(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 448(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/widen_bitcnt.ll b/llvm/test/CodeGen/X86/widen_bitcnt.ll index 541dfb54e96d2..cca9d4aa2a9f0 100644 --- a/llvm/test/CodeGen/X86/widen_bitcnt.ll +++ b/llvm/test/CodeGen/X86/widen_bitcnt.ll @@ -241,81 +241,77 @@ define <8 x i32> @widen_ctpop_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32 ; ; AVX2-LABEL: widen_ctpop_v2i32_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm5 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX2-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; 
AVX2-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpand %xmm4, %xmm1, %xmm5 -; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm7 +; AVX2-NEXT: vpand %xmm5, %xmm7, %xmm7 +; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm7 +; AVX2-NEXT: vpand %xmm5, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-NEXT: vpaddb %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm5 -; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm8 +; AVX2-NEXT: vpand %xmm5, %xmm8, %xmm8 +; AVX2-NEXT: vpshufb %xmm8, %xmm6, %xmm8 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm8[0] +; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-NEXT: vpaddb %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm5 -; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-NEXT: vpsrlw $4, %xmm3, %xmm3 -; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpaddb %xmm0, %xmm4, %xmm0 +; AVX2-NEXT: vpsrlw $4, %xmm3, %xmm2 +; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm2[0] +; AVX2-NEXT: vpand %xmm5, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-NEXT: vpaddb %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpsadbw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: widen_ctpop_v2i32_v8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %xmm4, %xmm0, %xmm5 +; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %xmm5, %xmm4, %xmm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VL-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufb %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX512VL-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX512VL-NEXT: vpaddb %xmm5, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512VL-NEXT: vpand %xmm4, %xmm1, %xmm5 -; AVX512VL-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm7 +; AVX512VL-NEXT: vpand %xmm5, %xmm7, %xmm7 +; AVX512VL-NEXT: vpshufb 
%xmm7, %xmm6, %xmm7 +; AVX512VL-NEXT: vpand %xmm5, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512VL-NEXT: vpaddb %xmm5, %xmm1, %xmm1 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX512VL-NEXT: vpand %xmm4, %xmm2, %xmm5 -; AVX512VL-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX512VL-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vpsrlw $4, %xmm2, %xmm8 +; AVX512VL-NEXT: vpand %xmm5, %xmm8, %xmm8 +; AVX512VL-NEXT: vpshufb %xmm8, %xmm6, %xmm8 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm8[0] +; AVX512VL-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512VL-NEXT: vpaddb %xmm0, %xmm4, %xmm0 +; AVX512VL-NEXT: vpsrlw $4, %xmm3, %xmm2 +; AVX512VL-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512VL-NEXT: vpaddb %xmm5, %xmm2, %xmm2 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX512VL-NEXT: vpand %xmm4, %xmm3, %xmm5 -; AVX512VL-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX512VL-NEXT: vpsrlw $4, %xmm3, %xmm3 -; AVX512VL-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm2[0] +; AVX512VL-NEXT: vpand %xmm5, %xmm3, %xmm3 ; AVX512VL-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512VL-NEXT: vpaddb %xmm5, %xmm3, %xmm3 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512VL-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; AVX512VL-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpsadbw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512VL-NEXT: vpsadbw %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; @@ -1304,47 +1300,45 @@ define <8 x i32> @widen_cttz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpaddd %xmm4, %xmm0, %xmm5 ; AVX2-NEXT: vpandn %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm5, %xmm0, %xmm6 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm5 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm6, %xmm5, %xmm5 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX2-NEXT: vpand %xmm6, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-NEXT: vpaddb %xmm6, %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpaddd %xmm4, %xmm1, %xmm6 -; AVX2-NEXT: vpandn %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm5, %xmm1, %xmm6 -; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm4, %xmm1, %xmm8 +; AVX2-NEXT: vpandn %xmm8, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm8 +; 
AVX2-NEXT: vpand %xmm6, %xmm8, %xmm8 +; AVX2-NEXT: vpshufb %xmm8, %xmm7, %xmm8 +; AVX2-NEXT: vpand %xmm6, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-NEXT: vpaddb %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-NEXT: vpaddd %xmm4, %xmm2, %xmm6 -; AVX2-NEXT: vpandn %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm6 -; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpaddd %xmm4, %xmm2, %xmm9 +; AVX2-NEXT: vpandn %xmm9, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm9 +; AVX2-NEXT: vpand %xmm6, %xmm9, %xmm9 +; AVX2-NEXT: vpshufb %xmm9, %xmm7, %xmm9 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm9[0] +; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX2-NEXT: vpaddb %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm4 -; AVX2-NEXT: vpandn %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpand %xmm5, %xmm3, %xmm4 -; AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX2-NEXT: vpsrlw $4, %xmm3, %xmm3 -; AVX2-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpaddb %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm2 +; AVX2-NEXT: vpandn %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm3 +; AVX2-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX2-NEXT: vpaddb %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm8[0],xmm3[0] +; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpsadbw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -1649,47 +1643,45 @@ define <8 x i32> @widen_cttz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 ; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpaddd %xmm4, %xmm0, %xmm5 ; AVX2-NEXT: vpandn %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm5, %xmm0, %xmm6 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm5 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm6, %xmm5, %xmm5 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX2-NEXT: vpand %xmm6, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-NEXT: vpaddb %xmm6, %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpaddd %xmm4, %xmm1, %xmm6 -; AVX2-NEXT: vpandn %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm5, %xmm1, %xmm6 -; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm6 
-; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm4, %xmm1, %xmm8 +; AVX2-NEXT: vpandn %xmm8, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm8 +; AVX2-NEXT: vpand %xmm6, %xmm8, %xmm8 +; AVX2-NEXT: vpshufb %xmm8, %xmm7, %xmm8 +; AVX2-NEXT: vpand %xmm6, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-NEXT: vpaddb %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX2-NEXT: vpaddd %xmm4, %xmm2, %xmm6 -; AVX2-NEXT: vpandn %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm6 -; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpaddd %xmm4, %xmm2, %xmm9 +; AVX2-NEXT: vpandn %xmm9, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm9 +; AVX2-NEXT: vpand %xmm6, %xmm9, %xmm9 +; AVX2-NEXT: vpshufb %xmm9, %xmm7, %xmm9 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm9[0] +; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX2-NEXT: vpaddb %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm4 -; AVX2-NEXT: vpandn %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpand %xmm5, %xmm3, %xmm4 -; AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX2-NEXT: vpsrlw $4, %xmm3, %xmm3 -; AVX2-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vpaddb %xmm0, %xmm5, %xmm0 +; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm2 +; AVX2-NEXT: vpandn %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm3 +; AVX2-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX2-NEXT: vpaddb %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm8[0],xmm3[0] +; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpsadbw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; diff --git a/llvm/test/TableGen/PerWriteCycleCount.td b/llvm/test/TableGen/PerWriteCycleCount.td new file mode 100644 index 0000000000000..ac60d8c438834 --- /dev/null +++ b/llvm/test/TableGen/PerWriteCycleCount.td @@ -0,0 +1,48 @@ +// RUN: llvm-tblgen -gen-subtarget -I %p/../../include %s 2>&1 | FileCheck %s +// RUN: not llvm-tblgen -gen-subtarget -I %p/../../include -DERROR1 %s 2>&1 | FileCheck --check-prefix=ERROR1 %s + +// Make sure that ReadAdvance entries with multiple writes are correctly +// handled. 
+ +include "llvm/Target/Target.td" + +def MyTarget : Target; + +let OutOperandList = (outs), InOperandList = (ins) in { + def Inst_A : Instruction; + def Inst_B : Instruction; + def Inst_C : Instruction; +} + +let CompleteModel = 0 in { + def SchedModel_A: SchedMachineModel; +} + +def Read_D : SchedRead; +def Read_E : SchedRead; + +// CHECK: extern const llvm::MCReadAdvanceEntry MyTargetReadAdvanceTable[] = { +// CHECK-NEXT: {0, 0, 0}, // Invalid +// CHECK-NEXT: {0, 1, 1}, // #1 +// CHECK-NEXT: {0, 2, 3}, // #2 +// CHECK-NEXT: {0, 3, 2} // #3 +// CHECK-NEXT: }; // MyTargetReadAdvanceTable + +let SchedModel = SchedModel_A in { + def Write_A : SchedWriteRes<[]>; + def Write_B : SchedWriteRes<[]>; + def Write_C : SchedWriteRes<[]>; + + def : InstRW<[Write_A], (instrs Inst_A)>; + def : InstRW<[Write_B], (instrs Inst_B)>; + def : InstRW<[Write_C, Read_D], (instrs Inst_C)>; + + def : ReadAdvance; + +#ifdef ERROR1 +// ERROR1: error: assertion failed: cannot have more `tunables' than `writes' + def : ReadAdvance; +#endif +} + +def ProcessorA: ProcessorModel<"ProcessorA", SchedModel_A, []>; diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll index 4e7e9eeb7250b..46ca99f4bb27b 100644 --- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll @@ -240,6 +240,23 @@ define i64 @cntd_all() { } +define i64 @udiv() vscale_range(1, 16) { +; CHECK-LABEL: @udiv( +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[A:%.*]] = shl nuw nsw i64 [[TMP1]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[B:%.*]] = shl nuw nsw i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = call range(i64 2, 65) i64 @llvm.cttz.i64(i64 [[B]], i1 true) +; CHECK-NEXT: [[C1:%.*]] = lshr i64 [[A]], [[TMP3]] +; CHECK-NEXT: ret i64 [[C1]] +; + %a = call i64 @llvm.aarch64.sve.cntb(i32 31) + %b = call i64 @llvm.aarch64.sve.cntw(i32 31) + %c = udiv i64 %a, %b + ret i64 %c +} + + declare i64 @llvm.aarch64.sve.cntb(i32 %pattern) declare i64 @llvm.aarch64.sve.cnth(i32 %pattern) declare i64 @llvm.aarch64.sve.cntw(i32 %pattern) diff --git a/llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll b/llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll index 63caec9501325..e82e33e9d7f04 100644 --- a/llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll +++ b/llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll @@ -103,4 +103,34 @@ entry: ret i32 %res } +define i64 @fold_cttz_64() vscale_range(1,16) { +; CHECK-LABEL: define i64 @fold_cttz_64( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i64 4 +; +entry: + %vscale = tail call i64 @llvm.vscale.i64() + %shl0 = shl nuw nsw i64 %vscale, 4 + %shl1 = shl nuw nsw i64 %vscale, 2 + %cttz = tail call range(i64 2, 65) i64 @llvm.cttz.i64(i64 %shl1, i1 true) + %div1 = lshr i64 %shl0, %cttz + ret i64 %div1 +} + +define i32 @fold_cttz_32() vscale_range(1,16) { +; CHECK-LABEL: define i32 @fold_cttz_32( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i32 4 +; +entry: + %vscale = tail call i32 @llvm.vscale.i32() + %shl0 = shl nuw nsw i32 %vscale, 4 + %shl1 = shl nuw nsw i32 %vscale, 2 + %cttz = tail call range(i32 2, 65) i32 @llvm.cttz.i32(i32 %shl1, i1 true) + %div1 = lshr i32 %shl0, %cttz + ret i32 %div1 +} + declare void @use(i32) diff --git 
a/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll b/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll index 3f81c0f5c822a..c5d2f6acf85b3 100644 --- a/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll @@ -134,7 +134,7 @@ define void @wide_phi_2_predecessors_phi_ops_swapped(ptr noalias %A, ptr noalias ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP1]], i32 8, <4 x i1> splat (i1 true), <4 x i64> poison) ; CHECK-NEXT: br label %[[INNER_LATCH4]] ; CHECK: [[INNER_LATCH4]]: -; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i64> [ zeroinitializer, %[[INNER_HEADER1]] ], [ [[WIDE_MASKED_GATHER]], %[[THEN3]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i64> [ [[WIDE_MASKED_GATHER]], %[[THEN3]] ], [ zeroinitializer, %[[INNER_HEADER1]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i64> [[VEC_PHI5]], [[VEC_IND]] ; CHECK-NEXT: [[TMP3]] = add nsw <4 x i64> [[TMP2]], [[VEC_PHI2]] ; CHECK-NEXT: [[TMP4]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1) diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/vscale.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/vscale.ll new file mode 100644 index 0000000000000..7aa50ddf61468 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/vscale.ll @@ -0,0 +1,15 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes="default<O3>" -mattr=+sve -S -o - %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +define i64 @udiv() vscale_range(1, 16) { +; CHECK-LABEL: @udiv( +; CHECK-NEXT: ret i64 4 +; + %a = call i64 @llvm.aarch64.sve.cntb(i32 31) + %b = call i64 @llvm.aarch64.sve.cntw(i32 31) + %c = udiv i64 %a, %b + ret i64 %c +} diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp index 1bafb52d357fa..bcb5d498c8cb9 100644 --- a/llvm/unittests/IR/ConstantRangeTest.cpp +++ b/llvm/unittests/IR/ConstantRangeTest.cpp @@ -1662,6 +1662,17 @@ TEST(ConstantRange, MakeAllowedICmpRegionEdgeCases) { .isFullSet()); } +TEST(ConstantRange, MakeExactICmpRegion) { + for (unsigned Bits : {1, 4}) { + EnumerateAPInts(Bits, [](const APInt &N) { + for (auto Pred : ICmpInst::predicates()) { + EXPECT_EQ(ConstantRange::makeAllowedICmpRegion(Pred, N), + ConstantRange::makeSatisfyingICmpRegion(Pred, N)); + }; + }); + } +} + TEST(ConstantRange, MakeSatisfyingICmpRegion) { ConstantRange LowHalf(APInt(8, 0), APInt(8, 128)); ConstantRange HighHalf(APInt(8, 128), APInt(8, 0)); diff --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp index d0a3cfa84ee01..8fbd470815b79 100644 --- a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp +++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp @@ -164,3 +164,83 @@ TEST(AMDGPU, TestVGPRLimitsPerOccupancy) { testGPRLimits("VGPR", true, test); } + +static const char *printSubReg(const TargetRegisterInfo &TRI, unsigned SubReg) { + return SubReg ?
TRI.getSubRegIndexName(SubReg) : ""; +} + +TEST(AMDGPU, TestReverseComposeSubRegIndices) { + auto TM = createAMDGPUTargetMachine("amdgcn-amd-", "gfx900", ""); + if (!TM) + return; + GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()), + std::string(TM->getTargetFeatureString()), *TM); + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + +#define EXPECT_SUBREG_EQ(A, B, Expect) \ + do { \ + unsigned Reversed = TRI->reverseComposeSubRegIndices(A, B); \ + EXPECT_EQ(Reversed, Expect) \ + << printSubReg(*TRI, A) << ", " << printSubReg(*TRI, B) << " => " \ + << printSubReg(*TRI, Reversed) << ", *" << printSubReg(*TRI, Expect); \ + } while (0); + + EXPECT_SUBREG_EQ(AMDGPU::NoSubRegister, AMDGPU::sub0, AMDGPU::sub0); + EXPECT_SUBREG_EQ(AMDGPU::sub0, AMDGPU::NoSubRegister, AMDGPU::sub0); + + EXPECT_SUBREG_EQ(AMDGPU::sub0, AMDGPU::sub0, AMDGPU::sub0); + + EXPECT_SUBREG_EQ(AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub1); + EXPECT_SUBREG_EQ(AMDGPU::sub1, AMDGPU::sub0, AMDGPU::NoSubRegister); + + EXPECT_SUBREG_EQ(AMDGPU::sub0_sub1, AMDGPU::sub0, AMDGPU::sub0); + EXPECT_SUBREG_EQ(AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1); + + EXPECT_SUBREG_EQ(AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub0_sub1, + AMDGPU::sub0_sub1); + EXPECT_SUBREG_EQ(AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2_sub3, + AMDGPU::sub0_sub1_sub2_sub3); + + EXPECT_SUBREG_EQ(AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub1_sub2, + AMDGPU::sub1_sub2); + EXPECT_SUBREG_EQ(AMDGPU::sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3, + AMDGPU::NoSubRegister); + + EXPECT_SUBREG_EQ(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0_sub1_sub2_sub3, + AMDGPU::NoSubRegister); + EXPECT_SUBREG_EQ(AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3, + AMDGPU::sub1_sub2_sub3); + + EXPECT_SUBREG_EQ(AMDGPU::sub0, AMDGPU::sub30, AMDGPU::NoSubRegister); + EXPECT_SUBREG_EQ(AMDGPU::sub30, AMDGPU::sub0, AMDGPU::NoSubRegister); + + EXPECT_SUBREG_EQ(AMDGPU::sub0, AMDGPU::sub31, AMDGPU::NoSubRegister); + EXPECT_SUBREG_EQ(AMDGPU::sub31, AMDGPU::sub0, AMDGPU::NoSubRegister); + + EXPECT_SUBREG_EQ(AMDGPU::sub0_sub1, AMDGPU::sub30, AMDGPU::NoSubRegister); + EXPECT_SUBREG_EQ(AMDGPU::sub30, AMDGPU::sub0_sub1, AMDGPU::NoSubRegister); + + EXPECT_SUBREG_EQ(AMDGPU::sub0_sub1, AMDGPU::sub30_sub31, + AMDGPU::NoSubRegister); + EXPECT_SUBREG_EQ(AMDGPU::sub30_sub31, AMDGPU::sub0_sub1, + AMDGPU::NoSubRegister); + + for (unsigned SubIdx0 = 1, LastSubReg = TRI->getNumSubRegIndices(); + SubIdx0 != LastSubReg; ++SubIdx0) { + for (unsigned SubIdx1 = 1; SubIdx1 != LastSubReg; ++SubIdx1) { + if (unsigned ForwardCompose = + TRI->composeSubRegIndices(SubIdx0, SubIdx1)) { + unsigned ReverseComposed = + TRI->reverseComposeSubRegIndices(SubIdx0, ForwardCompose); + EXPECT_EQ(ReverseComposed, SubIdx1); + } + + if (unsigned ReverseCompose = + TRI->reverseComposeSubRegIndices(SubIdx0, SubIdx1)) { + unsigned Recompose = TRI->composeSubRegIndices(SubIdx0, ReverseCompose); + EXPECT_EQ(Recompose, SubIdx1); + } + } + } +} diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 7fee62721e6e0..93ac7381b02ef 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1130,14 +1130,20 @@ INSTANTIATE_TEST_SUITE_P( AArch64CPUTestParams("apple-s4", "armv8.3-a"), AArch64CPUTestParams("apple-s5", "armv8.3-a"), AArch64CPUTestParams("apple-a13", "armv8.4-a"), + AArch64CPUTestParams("apple-s6", "armv8.4-a"), + AArch64CPUTestParams("apple-s7", "armv8.4-a"), + AArch64CPUTestParams("apple-s8", "armv8.4-a"), 
AArch64CPUTestParams("apple-a14", "armv8.4-a"), AArch64CPUTestParams("apple-m1", "armv8.4-a"), AArch64CPUTestParams("apple-a15", "armv8.6-a"), AArch64CPUTestParams("apple-m2", "armv8.6-a"), AArch64CPUTestParams("apple-a16", "armv8.6-a"), AArch64CPUTestParams("apple-m3", "armv8.6-a"), + AArch64CPUTestParams("apple-s9", "armv8.6-a"), + AArch64CPUTestParams("apple-s10", "armv8.6-a"), AArch64CPUTestParams("apple-a17", "armv8.6-a"), AArch64CPUTestParams("apple-m4", "armv8.7-a"), + AArch64CPUTestParams("apple-a18", "armv8.7-a"), AArch64CPUTestParams("exynos-m3", "armv8-a"), AArch64CPUTestParams("exynos-m4", "armv8.2-a"), AArch64CPUTestParams("exynos-m5", "armv8.2-a"), @@ -1246,13 +1252,17 @@ INSTANTIATE_TEST_SUITE_P( "apple-a8", "apple-a9"}), AArch64CPUAliasTestParams({"apple-a12", "apple-s4", "apple-s5"}), + AArch64CPUAliasTestParams({"apple-a13", "apple-s6", + "apple-s7", "apple-s8"}), AArch64CPUAliasTestParams({"apple-a14", "apple-m1"}), AArch64CPUAliasTestParams({"apple-a15", "apple-m2"}), - AArch64CPUAliasTestParams({"apple-a16", "apple-m3"})), + AArch64CPUAliasTestParams({"apple-a16", "apple-m3", + "apple-s9", "apple-s10"}), + AArch64CPUAliasTestParams({"apple-m4", "apple-a18"})), AArch64CPUAliasTestParams::PrintToStringParamName); // Note: number of CPUs includes aliases. -static constexpr unsigned NumAArch64CPUArchs = 82; +static constexpr unsigned NumAArch64CPUArchs = 88; TEST(TargetParserTest, testAArch64CPUArchList) { SmallVector List; diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 2f37c08bd9f11..5f73aa43daef9 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -672,7 +672,7 @@ TEST_F(VPBasicBlockTest, reassociateBlocks) { auto *WidenPhi = new VPWidenPHIRecipe(nullptr); IntegerType *Int32 = IntegerType::get(C, 32); VPValue *Val = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); - WidenPhi->addIncoming(Val, VPBB1); + WidenPhi->addOperand(Val); VPBB2->appendRecipe(WidenPhi); VPBasicBlock *VPBBNew = Plan.createVPBasicBlock("VPBBNew"); @@ -693,7 +693,8 @@ TEST_F(VPBasicBlockTest, reassociateBlocks) { auto *WidenPhi = new VPWidenPHIRecipe(nullptr); IntegerType *Int32 = IntegerType::get(C, 32); VPValue *Val = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); - WidenPhi->addIncoming(Val, VPBB1); + WidenPhi->addOperand(Val); + WidenPhi->addOperand(Val); VPBB2->appendRecipe(WidenPhi); VPBasicBlock *VPBBNew = Plan.createVPBasicBlock("VPBBNew"); diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index 2f9ec2e6e7a22..752ebdf01b948 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -680,8 +680,6 @@ static bool combine(const CodeGenSubRegIndex *Idx, void RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS, StringRef ClassName) { const auto &SubRegIndices = RegBank.getSubRegIndices(); - OS << "unsigned " << ClassName - << "::composeSubRegIndicesImpl(unsigned IdxA, unsigned IdxB) const {\n"; // Many sub-register indexes are composition-compatible, meaning that // @@ -713,7 +711,10 @@ void RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS, RowMap.push_back(Found); } - // Output the row map if there is multiple rows. + OS << "unsigned " << ClassName + << "::composeSubRegIndicesImpl(unsigned IdxA, unsigned IdxB) const {\n"; + + // Output the row map if there are multiple rows. 
if (Rows.size() > 1) { OS << " static const " << getMinimalTypeForRange(Rows.size(), 32) << " RowMap[" << SubRegIndicesSize << "] = {\n "; @@ -743,6 +744,51 @@ void RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS, else OS << " return Rows[0][IdxB];\n"; OS << "}\n\n"; + + // Generate the reverse case. + // + // FIXME: This is the brute force approach. Compress the table similar to the + // forward case. + OS << "unsigned " << ClassName + << "::reverseComposeSubRegIndicesImpl(unsigned IdxA, unsigned IdxB) const " + "{\n"; + OS << " static const " << getMinimalTypeForRange(SubRegIndicesSize + 1, 32) + << " Table[" << SubRegIndicesSize << "][" << SubRegIndicesSize + << "] = {\n"; + + // Find values where composeSubReg(A, X) == B; + for (const auto &IdxA : SubRegIndices) { + OS << " { "; + + SmallVectorImpl &Row = + Rows[RowMap[IdxA.EnumValue - 1]]; + for (const auto &IdxB : SubRegIndices) { + const CodeGenSubRegIndex *FoundReverse = nullptr; + + for (unsigned i = 0, e = SubRegIndicesSize; i != e; ++i) { + const CodeGenSubRegIndex *This = &SubRegIndices[i]; + const CodeGenSubRegIndex *Composed = Row[i]; + if (Composed == &IdxB) { + if (FoundReverse && FoundReverse != This) // Not unique + break; + FoundReverse = This; + } + } + + if (FoundReverse) { + OS << FoundReverse->getQualifiedName() << ", "; + } else { + OS << "0, "; + } + } + OS << "},\n"; + } + + OS << " };\n\n"; + OS << " --IdxA; assert(IdxA < " << SubRegIndicesSize << ");\n" + << " --IdxB; assert(IdxB < " << SubRegIndicesSize << ");\n"; + OS << " return Table[IdxA][IdxB];\n"; + OS << " }\n\n"; } void RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS, @@ -1113,6 +1159,8 @@ void RegisterInfoEmitter::runTargetHeader(raw_ostream &OS) { << " unsigned PC = 0, unsigned HwMode = 0);\n"; if (!RegBank.getSubRegIndices().empty()) { OS << " unsigned composeSubRegIndicesImpl" + << "(unsigned, unsigned) const override;\n" + << " unsigned reverseComposeSubRegIndicesImpl" << "(unsigned, unsigned) const override;\n" << " LaneBitmask composeSubRegIndexLaneMaskImpl" << "(unsigned, LaneBitmask) const override;\n" diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index 49362ff5ef655..aec05f1ae7742 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -1308,23 +1308,27 @@ void SubtargetEmitter::genSchedClassTables(const CodeGenProcModel &ProcModel, } ConstRecVec ValidWrites = ReadAdvance->getValueAsListOfDefs("ValidWrites"); - IdxVec WriteIDs; + std::vector CycleTunables = + ReadAdvance->getValueAsListOfInts("CycleTunables"); + std::vector> WriteIDs; + assert(CycleTunables.size() <= ValidWrites.size() && "Bad ReadAdvance"); + CycleTunables.resize(ValidWrites.size(), 0); if (ValidWrites.empty()) - WriteIDs.push_back(0); + WriteIDs.emplace_back(0, 0); else { - for (const Record *VW : ValidWrites) { + for (const auto [VW, CT] : zip_equal(ValidWrites, CycleTunables)) { unsigned WriteID = SchedModels.getSchedRWIdx(VW, /*IsRead=*/false); assert(WriteID != 0 && "Expected a valid SchedRW in the list of ValidWrites"); - WriteIDs.push_back(WriteID); + WriteIDs.emplace_back(WriteID, CT); } } llvm::sort(WriteIDs); - for (unsigned W : WriteIDs) { + for (const auto &[W, T] : WriteIDs) { MCReadAdvanceEntry RAEntry; RAEntry.UseIdx = UseIdx; RAEntry.WriteResourceID = W; - RAEntry.Cycles = ReadAdvance->getValueAsInt("Cycles"); + RAEntry.Cycles = ReadAdvance->getValueAsInt("Cycles") + T; ReadAdvanceEntries.push_back(RAEntry); } } diff --git 
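The RegisterInfoEmitter change above emits `reverseComposeSubRegIndicesImpl` as a brute-force table: for every pair of sub-register indices (IdxA, IdxB) it records the unique index X such that composing IdxA with X yields IdxB, or 0 when no unique X exists (the FIXME notes the table is not yet row-compressed like the forward one). The round-trip property this provides is what the AMDGPU unit test earlier in the patch loops over exhaustively; below is a distilled, hedged sketch of that check, assuming a fully constructed `TargetRegisterInfo` (obtaining one requires the usual target-machine setup shown in the unit test).

```c++
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include <cassert>

// For sub-register indices A, B (0 means "no sub-register" and is skipped):
//  - if compose(A, B) == C, then reverseCompose(A, C) should give back B;
//  - if reverseCompose(A, B) == R, then compose(A, R) should give back B.
static void checkReverseComposeRoundTrip(const llvm::TargetRegisterInfo &TRI) {
  const unsigned NumIndices = TRI.getNumSubRegIndices();
  for (unsigned A = 1; A != NumIndices; ++A) {
    for (unsigned B = 1; B != NumIndices; ++B) {
      if (unsigned Composed = TRI.composeSubRegIndices(A, B))
        assert(TRI.reverseComposeSubRegIndices(A, Composed) == B &&
               "reverse compose should undo compose");
      if (unsigned Reversed = TRI.reverseComposeSubRegIndices(A, B))
        assert(TRI.composeSubRegIndices(A, Reversed) == B &&
               "compose should undo reverse compose");
    }
  }
}
```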
a/llvm/utils/gn/secondary/bolt/include/bolt/Core/BUILD.gn b/llvm/utils/gn/secondary/bolt/include/bolt/Core/BUILD.gn new file mode 100644 index 0000000000000..2d7c1a70abe95 --- /dev/null +++ b/llvm/utils/gn/secondary/bolt/include/bolt/Core/BUILD.gn @@ -0,0 +1,6 @@ +import("//llvm/lib/Target/write_target_def_file.gni") + +write_target_def_file("TargetConfig.def") { + key = "BOLT_ENUM_TARGETS" + value = "BOLT_TARGET" +} diff --git a/llvm/utils/gn/secondary/bolt/tools/driver/BUILD.gn b/llvm/utils/gn/secondary/bolt/tools/driver/BUILD.gn index 004a7359698de..c174bf3c613f4 100644 --- a/llvm/utils/gn/secondary/bolt/tools/driver/BUILD.gn +++ b/llvm/utils/gn/secondary/bolt/tools/driver/BUILD.gn @@ -23,6 +23,7 @@ group("symlinks") { executable("llvm-bolt") { configs += [ "//llvm/utils/gn/build:bolt_code" ] deps = [ + "//bolt/include/bolt/Core:TargetConfig.def", "//bolt/lib/Profile", "//bolt/lib/Rewrite", "//bolt/lib/Utils", diff --git a/llvm/utils/gn/secondary/bolt/tools/heatmap/BUILD.gn b/llvm/utils/gn/secondary/bolt/tools/heatmap/BUILD.gn index b6270106dbaf8..78b65a12e945a 100644 --- a/llvm/utils/gn/secondary/bolt/tools/heatmap/BUILD.gn +++ b/llvm/utils/gn/secondary/bolt/tools/heatmap/BUILD.gn @@ -1,6 +1,7 @@ executable("llvm-bolt-heatmap") { configs += [ "//llvm/utils/gn/build:bolt_code" ] deps = [ + "//bolt/include/bolt/Core:TargetConfig.def", "//bolt/lib/Profile", "//bolt/lib/Rewrite", "//bolt/lib/Utils", diff --git a/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn b/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn index c7c9459fdff16..79f19a416c0e1 100644 --- a/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn +++ b/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn @@ -4,6 +4,7 @@ import("//third-party/unittest/unittest.gni") unittest("CoreTests") { configs += [ "//llvm/utils/gn/build:bolt_code" ] deps = [ + "//bolt/include/bolt/Core:TargetConfig.def", "//bolt/lib/Core", "//bolt/lib/Rewrite", "//bolt/lib/Profile", diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index c9f3af65a4565..5a13545a15b13 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -1,10 +1,10 @@ import("//llvm/include/llvm/Config/config.gni") import("//llvm/lib/DebugInfo/PDB/enable_dia.gni") -import("//llvm/lib/Target/targets.gni") import("//llvm/lib/Target/targets_with_asm_parsers.gni") import("//llvm/lib/Target/targets_with_disassemblers.gni") import("//llvm/lib/Target/targets_with_exegesis.gni") import("//llvm/lib/Target/targets_with_mcas.gni") +import("//llvm/lib/Target/write_target_def_file.gni") import("//llvm/triples.gni") import("//llvm/utils/gn/build/buildflags.gni") import("//llvm/utils/gn/build/libs/curl/enable.gni") @@ -477,65 +477,41 @@ write_cmake_config("llvm-config") { ############################################################################## # .def files used by llvm/lib/Target -template("write_target_def_file") { - assert(defined(invoker.key), "must set 'key' in $target_name") - assert(defined(invoker.value), "must set 'value' in $target_name") - - write_cmake_config(target_name) { - visibility = [ ":write_target_def_files" ] - input = "$target_name.in" - output = "$target_gen_dir/$target_name" - - if (defined(invoker.all_targets)) { - all_targets = invoker.all_targets - } else { - all_targets = llvm_targets_to_build - } - - # Build something like - # `LLVM_ENUM_ASM_PARSERS=LLVM_ASM_PARSER(ARM)\nLLVM_ASM_PARSER(X86)\n`. 
Note - # that \n is a literal '\' followed by a literal 'n', not a newline - # character. (write_cmake_config.py replaces that with a real newline). - value = "" - foreach(target, all_targets) { - value = "$value${invoker.value}($target)\n" - } - if (all_targets == []) { - not_needed(invoker, [ "value" ]) - } - values = [ "${invoker.key}=$value" ] - } -} - write_target_def_file("AsmParsers.def") { + visibility = [ ":write_target_def_files" ] key = "LLVM_ENUM_ASM_PARSERS" value = "LLVM_ASM_PARSER" all_targets = targets_with_asm_parsers } write_target_def_file("AsmPrinters.def") { + visibility = [ ":write_target_def_files" ] key = "LLVM_ENUM_ASM_PRINTERS" value = "LLVM_ASM_PRINTER" } write_target_def_file("Disassemblers.def") { + visibility = [ ":write_target_def_files" ] key = "LLVM_ENUM_DISASSEMBLERS" value = "LLVM_DISASSEMBLER" all_targets = targets_with_disassemblers } write_target_def_file("Targets.def") { + visibility = [ ":write_target_def_files" ] key = "LLVM_ENUM_TARGETS" value = "LLVM_TARGET" } write_target_def_file("TargetMCAs.def") { + visibility = [ ":write_target_def_files" ] key = "LLVM_ENUM_TARGETMCAS" value = "LLVM_TARGETMCA" all_targets = targets_with_mcas } write_target_def_file("TargetExegesis.def") { + visibility = [ ":write_target_def_files" ] key = "LLVM_ENUM_EXEGESIS" value = "LLVM_EXEGESIS" all_targets = targets_with_exegesis diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/write_target_def_file.gni b/llvm/utils/gn/secondary/llvm/lib/Target/write_target_def_file.gni new file mode 100644 index 0000000000000..8ff5edeb41f3d --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/lib/Target/write_target_def_file.gni @@ -0,0 +1,36 @@ +import("//llvm/lib/Target/targets.gni") +import("//llvm/utils/gn/build/write_cmake_config.gni") + +template("write_target_def_file") { + assert(defined(invoker.key), "must set 'key' in $target_name") + assert(defined(invoker.value), "must set 'value' in $target_name") + + write_cmake_config(target_name) { + input = "$target_name.in" + output = "$target_gen_dir/$target_name" + + if (defined(invoker.all_targets)) { + all_targets = invoker.all_targets + } else { + all_targets = llvm_targets_to_build + } + + if (defined(invoker.visibility)) { + visibility = invoker.visibility + } + + # Build something like + # `LLVM_ENUM_ASM_PARSERS=LLVM_ASM_PARSER(ARM)\nLLVM_ASM_PARSER(X86)\n`. Note + # that \n is a literal '\' followed by a literal 'n', not a newline + # character. (write_cmake_config.py replaces that with a real newline). + value = "" + foreach(target, all_targets) { + value = "$value${invoker.value}($target)\n" + } + if (all_targets == []) { + not_needed(invoker, [ "value" ]) + } + values = [ "${invoker.key}=$value" ] + } +} + diff --git a/mlir/docs/PassManagement.md b/mlir/docs/PassManagement.md index 9fb0aaab06461..eda48a44cf023 100644 --- a/mlir/docs/PassManagement.md +++ b/mlir/docs/PassManagement.md @@ -809,11 +809,6 @@ def MyPass : Pass<"my-pass", "ModuleOp"> { its various constraints and behavior. }]; - // A constructor must be provided to specify how to create a default instance - // of MyPass. It can be skipped for this specific example, because both the - // constructor and the registration methods live in the same namespace. - let constructor = "foo::createMyPass()"; - // Specify any options. 
let options = [ Option<"option", "example-option", "bool", /*default=*/"true", @@ -883,8 +878,7 @@ struct MyPassOptions { #endif // GEN_PASS_DECL_MYPASS ``` -If the `constructor` field has not been specified in the tablegen declaration, -then autogenerated file will also contain the declarations of the default +The utogenerated file will also contain the declarations of the default constructors. ```c++ @@ -927,9 +921,8 @@ struct MyPass : foo::impl::MyPassBase { These definitions can be enabled on a per-pass basis by defining the appropriate preprocessor `GEN_PASS_DEF_PASSNAME` macro, with `PASSNAME` equal to the uppercase version of the name of the pass definition in tablegen. -If the `constructor` field has not been specified in tablegen, then the default -constructors are also defined and expect the name of the actual pass class to -be equal to the name defined in tablegen. +The default constructors are also defined and expect the name of the actual pass +class to be equal to the name defined in tablegen. Using the `gen-pass-doc` generator, markdown documentation for each of the passes can be generated. See [Passes.md](Passes.md) for example output of real @@ -951,12 +944,14 @@ contains the following fields: * `dependentDialects` - A list of strings representing the `Dialect` classes this pass may introduce entities, Attributes/Operations/Types/etc., of. -* `constructor` - - A code block used to create a default instance of the pass. * `options` - A list of pass options used by the pass. * `statistics` - A list of pass statistics used by the pass. +* `constructor` + - A code block used to create a default instance of the pass. + Specifying it will disable the constructors auto-generation for the + pass. This is a legacy option, it is not advised to use it. #### Options diff --git a/mlir/docs/PatternRewriter.md b/mlir/docs/PatternRewriter.md index d15e7e5a80678..9df4647299010 100644 --- a/mlir/docs/PatternRewriter.md +++ b/mlir/docs/PatternRewriter.md @@ -361,7 +361,7 @@ This driver comes in two fashions: * `applyPatternsGreedily` ("region-based driver") applies patterns to all ops in a given region or a given container op (but not the container op itself). I.e., the worklist is initialized with all containing ops. -* `applyOpPatternsAndFold` ("op-based driver") applies patterns to the +* `applyOpPatternsGreedily` ("op-based driver") applies patterns to the provided list of operations. I.e., the worklist is initialized with the specified list of ops. 
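To make the renamed op-based driver concrete, here is a small hedged sketch of the call site it describes; the enclosing function and the commented-out pattern registration are placeholders, not APIs introduced by this patch.

```c++
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

using namespace mlir;

// Run a set of rewrite patterns over an explicit list of operations.
// `targetOps` seeds the worklist directly; other ops in the enclosing
// region are not visited unless the patterns create them.
static LogicalResult simplifyTargets(MLIRContext *ctx,
                                     ArrayRef<Operation *> targetOps) {
  RewritePatternSet patterns(ctx);
  // patterns.add<MyRewritePattern>(ctx);  // placeholder pattern registration

  return applyOpPatternsGreedily(targetOps, std::move(patterns));
}
```

The region-based `applyPatternsGreedily` differs only in how the worklist is seeded: it takes a region or container op and starts from every op inside it.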
diff --git a/mlir/include/mlir/Analysis/DataFlowFramework.h b/mlir/include/mlir/Analysis/DataFlowFramework.h index a3714c4332fbb..6aa0900d1412a 100644 --- a/mlir/include/mlir/Analysis/DataFlowFramework.h +++ b/mlir/include/mlir/Analysis/DataFlowFramework.h @@ -146,7 +146,7 @@ struct ProgramPoint : public StorageUniquer::BaseStorage { Operation *op = nullptr; }; -inline raw_ostream &operator<<(raw_ostream &os, ProgramPoint point) { +inline raw_ostream &operator<<(raw_ostream &os, const ProgramPoint &point) { point.print(os); return os; } @@ -662,7 +662,7 @@ inline raw_ostream &operator<<(raw_ostream &os, const AnalysisState &state) { return os; } -inline raw_ostream &operator<<(raw_ostream &os, LatticeAnchor anchor) { +inline raw_ostream &operator<<(raw_ostream &os, const LatticeAnchor &anchor) { anchor.print(os); return os; } diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h index c950ef220f692..9a890ae24d8fc 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h +++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h @@ -108,6 +108,10 @@ class TargetOptions { /// Returns the default compilation target: `CompilationTarget::Fatbin`. static CompilationTarget getDefaultCompilationTarget(); + /// Returns a tokenization of the command line options. + static std::pair> + tokenizeCmdOptions(const std::string &cmdOptions); + protected: /// Derived classes must use this constructor to initialize `typeID` to the /// appropiate value: ie. `TargetOptions(TypeID::get())`. diff --git a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h index caa0901bb4943..035235fc7174a 100644 --- a/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h @@ -37,6 +37,11 @@ struct GPUToNVVMPipelineOptions *this, "cubin-format", llvm::cl::desc("Compilation format to use to serialize to cubin."), llvm::cl::init("fatbin")}; + PassOptions::Option cmdOptions{ + *this, "ptxas-cmd-options", + llvm::cl::desc( + "Command line options to pass to the downstream compiler."), + llvm::cl::init("")}; PassOptions::Option optLevel{ *this, "opt-level", llvm::cl::desc("Optimization level for NVVM compilation"), diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td index e055164a1c384..faf4c9ddbc7a7 100644 --- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td @@ -143,6 +143,9 @@ def GpuNVVMAttachTarget: Pass<"nvvm-attach-target", ""> { "Enable flush to zero for denormals.">, ListOption<"linkLibs", "l", "std::string", "Extra bitcode libraries paths to link to.">, + Option<"cmdOptions", "ptxas-cmd-options", "std::string", + /*default=*/ [{""}], + "Command line options passed to downstream compiler">, ]; } diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index fe15a524ec3b5..0de5a87e72c3f 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -2862,6 +2862,8 @@ def NVVM_TargettAttr : NVVM_Attr<"NVVMTarget", "target"> { bool hasFlag(StringRef flag) const; bool hasFastMath() const; bool hasFtz() const; + bool hasCmdOptions() const; + std::optional getCmdOptions() const; }]; let extraClassDefinition = [{ bool $cppClass::hasFlag(StringRef flag) const { @@ -2875,6 +2877,12 @@ def NVVM_TargettAttr : NVVM_Attr<"NVVMTarget", 
"target"> { bool $cppClass::hasFtz() const { return hasFlag("ftz"); } + bool $cppClass::hasCmdOptions() const { + return hasFlag("ptxas-cmd-options"); + } + std::optional $cppClass::getCmdOptions() const { + return getFlags().getNamed("ptxas-cmd-options"); + } }]; } diff --git a/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt index 71214b4404c55..efd708c5e5a11 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt @@ -65,6 +65,13 @@ add_public_tablegen_target(MLIRLinalgStructuredOpsIncGen) add_dependencies(MLIRLinalgStructuredOpsIncGen LinalgOdsGen) add_dependencies(mlir-headers MLIRLinalgStructuredOpsIncGen) +set(LLVM_TARGET_DEFINITIONS LinalgRelayoutOps.td) +mlir_tablegen(LinalgRelayoutOps.h.inc -gen-op-decls) +mlir_tablegen(LinalgRelayoutOps.cpp.inc -gen-op-defs) +add_public_tablegen_target(MLIRLinalgRelayoutOpsIncGen) +add_dependencies(MLIRLinalgRelayoutOpsIncGen LinalgOdsGen) +add_dependencies(mlir-headers MLIRLinalgRelayoutOpsIncGen) + set(LLVM_TARGET_DEFINITIONS LinalgInterfaces.td) mlir_tablegen(LinalgInterfaces.h.inc -gen-op-interface-decls) mlir_tablegen(LinalgInterfaces.cpp.inc -gen-op-interface-defs) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/Linalg.h b/mlir/include/mlir/Dialect/Linalg/IR/Linalg.h index cb046baead87b..0a3b7f1e3f5c0 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/Linalg.h +++ b/mlir/include/mlir/Dialect/Linalg/IR/Linalg.h @@ -124,4 +124,7 @@ OpFoldResult createFoldedDimOp(OpBuilder &b, Location loc, Value val, #define GET_OP_CLASSES #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.h.inc" +#define GET_OP_CLASSES +#include "mlir/Dialect/Linalg/IR/LinalgRelayoutOps.h.inc" + #endif // MLIR_DIALECT_LINALG_IR_LINALG_H diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td index dbc1ac60e0973..247afc141c180 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td @@ -178,6 +178,16 @@ def LinalgConvolutionOpInterface : OpInterface<"ConvolutionOpInterface"> { ]; } +def LinalgRelayoutOpInterface : OpInterface<"RelayoutOpInterface"> { + let description = [{ + A Linalg relayout-op is either linalg.pack or linalg.unpack. + + While we could extend this interface with methods from Linalg_RelayoutOp, + this is currently not needed and left as a TODO. + }]; + let cppNamespace = "::mlir::linalg"; +} + def LinalgFillOpInterface : OpInterface<"FillOpInterface"> { let description = [{ A fill operation is defined in general terms: diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td new file mode 100644 index 0000000000000..a08a778fc25e1 --- /dev/null +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td @@ -0,0 +1,336 @@ +//===- LinalgReleayoutOps.td - Linalg relayout ops ---------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines Pack + Unpack Ops that have been moved from the Tensor +// dialect. As such, these are defined as memory-effect-free and only accept +// "tensors" as inputs. 
+// +// TODO: Once a good motivating example is identified, relax these +// restrictions. +// +//===----------------------------------------------------------------------===// + +#ifndef LINALG_RELEAYOUT_OPS +#define LINALG_RELEAYOUT_OPS + +include "mlir/Dialect/Linalg/IR/LinalgBase.td" +include "mlir/Interfaces/DestinationStyleOpInterface.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/Dialect/Linalg/IR/LinalgInterfaces.td" +include "mlir/IR/OpAsmInterface.td" + +//===----------------------------------------------------------------------===// +// RelayoutOp +//===----------------------------------------------------------------------===// + +class Linalg_RelayoutOp traits = []> : + Op, + DestinationStyleOpInterface, LinalgRelayoutOpInterface, + ConditionallySpeculatable, NoMemoryEffect, + DeclareOpInterfaceMethods, + TypesMatchWith<"result type matches type of dest", + "dest", "result", + "$_self">])> { + + code commonExtraClassDeclaration = [{ + size_t getSourceRank() { return getSourceType().getRank(); }; + size_t getDestRank() { return getDestType().getRank(); }; + RankedTensorType getSourceType() { + return ::llvm::cast(getSource().getType()); }; + RankedTensorType getDestType() { + return ::llvm::cast(getDest().getType()); }; + + MutableOperandRange getDpsInitsMutable() { return getDestMutable(); } + + /// Interface method for ConditionallySpeculatable. + Speculation::Speculatability getSpeculatability(); + + /// Return a mapping from positions `inner_dims_pos` to their + /// tile factors. + DenseMap getDimAndTileMapping(); + + /// Return the tile sizes as OpFoldResult. + SmallVector getMixedTiles(); + + /// Return the tile sizes as `int64_t`. If a tile size is dynamic + /// a sentinel `kDynamic` is introduced at that position in + /// the returned vector. + SmallVector getStaticTiles(); + + /// Retrieve all outer dims for this Pack/UnPack Op, i.e. all the leading + /// dims excluding the trailing dims corresponding to `innerTiles`. Note + /// that this will include both tiled and non-tiled dimensions. The order + /// of the output dimensions is consistent with the shape of the packed + /// tensor. + ArrayRef getAllOuterDims(); + + /// Similar to `getAllOuterDims`, but only retrieve the outer dims that + /// have been tiled. Also, the order of the output dimensions is consistent + /// with `inner_dims_pos` rather than the packed tensor. + SmallVector getTiledOuterDims(); + }]; + + let hasVerifier = 1; +} + +//===----------------------------------------------------------------------===// +// PackOp +//===----------------------------------------------------------------------===// + +def Linalg_PackOp : Linalg_RelayoutOp<"pack", [ + AttrSizedOperandSegments]> { + let summary = "linalg.pack operation"; + let description = [{ + The "pack" operation converts a source tensor of rank `n` into a result + tensor of rank `n + k` with a tiled and packed layout (maybe with padding) + and optionally transposes the tiled source tensor dimensions. + + `inner_dims_pos` (mandatory) specifies `k` source tensor dimensions that are + being tiled, where `0 < k <= n`. The order of the dimensions matters: + - The tiled dimensions (of size `inner_tiles`) are added to the end of the result + tensor in the order in which they appear in `inner_dims_pos`. + - `inner_dims_pos[i]` specifies the source tensor dimension tiled by + `inner_tiles[i]`. + + `inner_tiles` (mandatory) specifies `k` tile sizes. 
These tile sizes + correspond to the least significant ("inner") result tensor dimension sizes, + in the same order. Tile sizes can be static or dynamic. + + Example: If `inner_tiles = [16, 32]`, the result tensor has a shape of + `...x16x32`. If `inner_dims_pos = [0, 1]`, the 0th source dimension is tiled + by 16 and the 1st source dimension is tiled by 32. Other source dimensions + (if any) are not tiled. If `inner_dims_pos = [1, 0]`, the 1st dimension is + tiled by 16 and the 0th dimension is tiled by 32. + + Example: + ```mlir + // NC to NCnc + %0 = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [8, 32] + into %dest : tensor<128x256xf32> -> tensor<16x8 x 8x32 xf32> + // \ / \ / + // outer dims inner dims + ``` + + `outer_dims_perm` (optional) specifies a permutation for the outer + dimensions. If specified, it must have `n` elements. + + Example: + ```mlir + // CK to KCck + %0 = linalg.pack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] + inner_tiles = [8, 32] into %dest + : tensor<128x256xf32> -> tensor<8x16 x 8x32 xf32> + // \ / + // compare with "NC to NCnc": outer dims are transposed + ``` + + `padding_value` specifies a padding value at the boundary on non-perfectly + divisible dimensions. Padding is optional: + - If absent, it is UB if the tile does not perfectly divide the dimension. + - If present, it will pad along high dimensions (high-padding) to make the + tile complete. + + Example: + ```mlir + %0 = linalg.pack %arg0 padding_value(%pad : f32) outer_dims_perm = [2, 1, 0] + inner_dims_pos = [1] inner_tiles = [2] into %arg1 + : tensor<200x127x256xf32> -> tensor<256x64x200x2xf32> + // \ + // padded and tiled dim + // + // Source dimension 1 is tiled. 64 does not divide 127 evenly, so 1 padded + // element is added at the end. + // + // Note: Only tiled dimensions can be padded. + ``` + }]; + let arguments = (ins AnyRankedTensor:$source, + AnyRankedTensor:$dest, + Optional:$padding_value, + DefaultValuedOptionalAttr:$outer_dims_perm, + DenseI64ArrayAttr:$inner_dims_pos, + Variadic:$inner_tiles, + DenseI64ArrayAttr:$static_inner_tiles); + let results = (outs AnyRankedTensor:$result); + let assemblyFormat = [{ + $source + (`padding_value` `(` $padding_value^ `:` type($padding_value) `)`)? + (`outer_dims_perm` `=` $outer_dims_perm^)? + `inner_dims_pos` `=` $inner_dims_pos + `inner_tiles` `=` + custom($inner_tiles, $static_inner_tiles) + `into` $dest attr-dict `:` type($source) `->` type($dest) + }]; + + let builders = [ + OpBuilder<(ins "Value":$source, "Value":$dest, + "ArrayRef":$innerDimsPos, + "ArrayRef":$innerTiles, + CArg<"std::optional", "std::nullopt">:$paddingValue, + CArg<"ArrayRef", "{}">:$outerDimsPerm)> + ]; + + let extraClassDeclaration = commonExtraClassDeclaration # [{ + // Method to get the shape of the result as `SmallVector`. + // This is a static method to allow getting the shape of the destination + // expected while creating a `pack` op. + static SmallVector getResultShape(OpBuilder &builder, + Location loc, ArrayRef sourceDims, + ArrayRef innerTileDims, ArrayRef innerDimsPos, + ArrayRef outerDimsPerm = {}); + + // Method to get the `RankedTensorType` of the result based on the inner + // tiles, position of the inner tiles (innerDimsPos) and interchange vector + // of outer loops (outerDimsPerm). 
+ static RankedTensorType inferPackedType(RankedTensorType sourceType, + ArrayRef innerTileSizes, ArrayRef innerDimsPos, + ArrayRef outerDimsPerm = {}); + + // Returns true if we have enough static information to catch undefined + // behavior when the tile size does not divide perfectly the dimension of + // the input tensor. Detecting UB requires that the input size and either + // corresponding tile or output size are static. + static bool requirePaddingValue(ArrayRef inputShape, + ArrayRef innerDimsPos, + ArrayRef outputShape, + ArrayRef outerDimsPerm, + ArrayRef innerTiles); + + static Value createDestinationTensor(OpBuilder &b, Location loc, + Value source, ArrayRef innerTileSizes, + ArrayRef innerDimsPos, ArrayRef outerDimsPerm); + + /// Build and return a new PackOp that is a clone of the current PackOp with + /// (innerDimsPos, innerTiles) (resp. outerDimsPerm) are permuted by + /// innerPermutation (resp. outerPermutation). + /// A new `tensor.empty` of the proper shape is built in the process. + /// Asserts that: + /// - At least one of innerPermutation or outerPermutation is non-empty. + /// - If not empty, innerPermutation is a valid permutation of size + /// matching innerDimPos. + /// - If not empty, outerPermutation is a valid permutation of size + /// matching outerDimsPerm. + PackOp createTransposedClone(OpBuilder &b, + Location loc, + ArrayRef innerPermutation, + ArrayRef outerPermutation); + + /// Check if this PackOp is like a simple pad operation. + /// In other words, this operation: + /// 1. adds useless dimensions (dimension of size 1), + /// 2. pads the other ones, and + /// 3. doesn't shuffle the dimensions + bool isLikePad(); + }]; + + let hasCanonicalizeMethod = 1; + + let hasFolder = 1; +} + +//===----------------------------------------------------------------------===// +// UnPackOp +//===----------------------------------------------------------------------===// + +def Linalg_UnPackOp : Linalg_RelayoutOp<"unpack"> { + let summary = "linalg.unpack operation"; + let description = [{ + The "unpack" operation converts a source tensor of rank `n` with a tiled and + packed layout to a result tensor of rank `n - k`. + + `inner_dims_pos` (mandatory) specifies `k` source tensor dimensions with + which the last `k` source tensor dimensions are combined, where + `0 < k <= n/2`. Each `inner_dims_pos` element must be `>= 0` and `< n - k`. + The order of the dimensions in `inner_dims_pos` matters: dimension + `inner_dims_pos[i]` is combined with dimension `n - k + i` (assuming that + `outer_dims_perm` is not specified). + + `inner_tiles` (mandatory) specifies `k` tile sizes. These tile sizes + correspond to the least significant ("inner") source tensor dimension sizes. + The behavior of this op is undefined if: + - `inner_tiles` do not exactly match with the corresponding source tensor + dimension sizes. + - Or, `inner_tiles[i]` does not divide the size of dimension + `inner_dims_pos[i]` (assuming that `outer_dims_perm` is not specified) + evenly. + + `outer_dims_perm` (optional) specifies a permutation for the outer + dimensions. If specified, it must have `n - k` elements. If specified, this + permutation is applied before combining any dimensions. 
+ + Example: + + ```mlir + // NCnc to NC: + %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [8, 32] + into %dest : tensor<16x8x8x32xf32> -> tensor<128x256xf32> + + // CK to KCck: + %0 = linalg.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] + inner_tiles = [8, 32] into %dest + : tensor<8x16x8x32xf32> -> tensor<128x256xf32> + ``` + }]; + let arguments = (ins AnyRankedTensor:$source, + AnyRankedTensor:$dest, + DefaultValuedOptionalAttr:$outer_dims_perm, + DenseI64ArrayAttr:$inner_dims_pos, + Variadic:$inner_tiles, + DenseI64ArrayAttr:$static_inner_tiles); + let results = (outs AnyRankedTensor:$result); + let assemblyFormat = [{ + $source + (`outer_dims_perm` `=` $outer_dims_perm^)? + `inner_dims_pos` `=` $inner_dims_pos + `inner_tiles` `=` + custom($inner_tiles, $static_inner_tiles) + `into` $dest attr-dict `:` type($source) `->` type($dest) + }]; + + let builders = [ + OpBuilder<(ins "Value":$source, "Value":$dest, + "ArrayRef":$innerDimsPos, + "ArrayRef":$innerTiles, + CArg<"ArrayRef", "{}">:$outerDimsPerm)> + ]; + + let extraClassDeclaration = commonExtraClassDeclaration # [{ + static Value createDestinationTensor(OpBuilder &b, Location loc, + Value source, ArrayRef innerTileSizes, + ArrayRef innerDimsPos, ArrayRef outerDimsPerm); + + /// Build and return a new UnPackOp that is a clone of the current UnPackOp + /// with (innerDimsPos, innerTiles) (resp. outerDimsPerm) are permuted by + /// innerPermutation (resp. outerPermutation). + /// Asserts that: + /// - At least one of innerPermutation or outerPermutation is non-empty. + /// - If not empty, innerPermutation is a valid permutation of size + /// matching innerDimPos. + /// - If not empty, outerPermutation is a valid permutation of size + /// matching outerDimsPerm. + UnPackOp createTransposedClone(OpBuilder &b, + Location loc, + Value transposedSource, + ArrayRef innerPermutation, + ArrayRef outerPermutation); + + /// Check if this UnPackOp is like a simple unpad operation. + /// In other words, this operation: + /// 1. drops useless dimensions (dimension of size 1), and + /// 2. reduces dimensions in place (i.e., no transpose.) + bool isLikeUnPad(); + }]; + + let hasCanonicalizeMethod = 1; + + let hasFolder = 1; +} + +#endif // LINALG_RELEAYOUT_OPS diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td index e86d175489775..12080cee85c9d 100644 --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -45,7 +45,7 @@ def ApplyDecomposeTensorPackUnpackPatternsOp : Op]> { let description = [{ - Collect patterns to decompose tensor.pack and tensor.unpack into e.g. + Collect patterns to decompose linalg.pack and linalg.unpack into e.g. tensor::PadOp, linalg::transposeOp Ops. Requires all outer dims to be unit. }]; @@ -126,6 +126,28 @@ def ApplyPadVectorizationPatternsOp : Op]> { + let description = [{ + Indicates that operations like tensor.pad and tensor.extract_slice should + be folded into linalg.pack and linalg.unpack operations, respectively. 
+ }]; + + let assemblyFormat = "attr-dict"; +} + +def ApplyFoldPackUnpackIntoEmptyPatternsOp : Op]> { + let description = [{ + // TODO: + }]; + + let arguments = (ins DefaultValuedAttr:$fold_single_use_only); + let assemblyFormat = "attr-dict"; +} + //===----------------------------------------------------------------------===// // BufferizeToAllocationOp //===----------------------------------------------------------------------===// @@ -547,19 +569,18 @@ def LowerPackOp : Op { let description = [{ - Rewrite a tensor.pack into tensor.pad + tensor.expand_shape + linalg.transpose. + Rewrite a linalg.pack into tensor.pad + tensor.expand_shape + linalg.transpose. #### Return modes - This operation ignores non-pack ops and drops them in the return. - This operation produces a silenceable failure if the rewrite fails for any - reason. - If all the operations referred to by the `target` are rewritten, the - transform succeeds. - Return handles to the newly produced pad, expand_shape and transpose ops. + This operation ignores non-pack ops and drops them in the return. This + operation produces a silenceable failure if the rewrite fails for any + reason. If all the operations referred to by the `target` are rewritten, + the transform succeeds. Return handles to the newly produced pad, + expand_shape and transpose ops. }]; - let arguments = (ins Transform_ConcreteOpType<"tensor.pack">:$target, + let arguments = (ins Transform_ConcreteOpType<"linalg.pack">:$target, DefaultValuedAttr:$lowerPadLikeWithInsertSlice); let results = (outs Transform_ConcreteOpType<"tensor.pad">:$pad_op, Transform_ConcreteOpType<"tensor.expand_shape">:$expand_shape_op, @@ -571,7 +592,7 @@ def LowerPackOp : Op { let description = [{ - Lower a tensor.unpack into empty + linalg.transpose + tensor.collapse_shape + + Lower a linalg.unpack into empty + linalg.transpose + tensor.collapse_shape + tensor.extract_slice. #### Return modes - This operation ignores non-unpack ops and drops them in the return. - This operation produces a silenceable failure if the rewrite fails for any - reason. - If all the operations referred to by the `target` are rewritten, the - transform succeeds. - Return handles to the newly produced empty, transpose, collapse_shape and extract_slice ops. + This operation ignores non-unpack ops and drops them in the return. This + operation produces a silenceable failure if the rewrite fails for any + reason. If all the operations referred to by the `target` are rewritten, + the transform succeeds. Return handles to the newly produced empty, + transpose, collapse_shape and extract_slice ops. }]; - let arguments = (ins Transform_ConcreteOpType<"tensor.unpack">:$target, + let arguments = (ins Transform_ConcreteOpType<"linalg.unpack">:$target, DefaultValuedAttr:$lowerUnpadLikeWithExtractSlice); let results = (outs Transform_ConcreteOpType<"tensor.empty">:$empty_op, Transform_ConcreteOpType<"linalg.transpose">:$transpose_op, @@ -613,7 +633,7 @@ def LowerUnPackOp : Op, ReportTrackingListenerFailuresOpTrait]> { let description = [{ - Apply a transposition to a single `tensor.pack` (resp. `tensor.unpack`) and + Apply a transposition to a single `linalg.pack` (resp. `linalg.unpack`) and update the `linalg.generic` op that consumes (resp. produces) the operation. 
This transform allows composing a simple `structured.pack` with additional @@ -989,19 +1009,19 @@ def PackTransposeOp : Op lowerPack(RewriterBase &rewriter, - tensor::PackOp packOp, + linalg::PackOp packOp, bool lowerPadLikeWithInsertSlice = true); struct LowerUnPackOpResult { @@ -1134,14 +1134,14 @@ struct LowerUnPackOpResult { /// Rewrite pack as empty + transpose + reshape + extract_slice. FailureOr -lowerUnPack(RewriterBase &rewriter, tensor::UnPackOp unPackOp, +lowerUnPack(RewriterBase &rewriter, linalg::UnPackOp unPackOp, bool lowerUnpadLikeWithExtractSlice = true); /// Struct to hold the result of a `pack` call. struct PackResult { - SmallVector packOps; + SmallVector packOps; linalg::LinalgOp packedLinalgOp; - SmallVector unPackOps; + SmallVector unPackOps; }; /// Implement packing of a single LinalgOp by `packedSizes`. /// There must be one packedSizes entry per `linalgOp` iterator. @@ -1151,9 +1151,9 @@ FailureOr pack(RewriterBase &rewriter, linalg::LinalgOp linalgOp, /// Struct to hold the result of a `packTranspose` call. struct PackTransposeResult { - tensor::PackOp transposedPackOp; + linalg::PackOp transposedPackOp; linalg::LinalgOp transposedLinalgOp; - tensor::UnPackOp transposedUnPackOp; + linalg::UnPackOp transposedUnPackOp; }; /// Transpose a single PackOp -> LinalgOp -> UnPackOp chain and return the /// transposed PackOp -> LinalgOp -> UnPackOp chain after replacements. @@ -1164,8 +1164,8 @@ struct PackTransposeResult { /// 3. `outerPerm` (resp. `innerPerm`) must be valid permutations of /// `packOp.getOuterDimsPerm` (resp. `packOp.getInnerDimsPerm`) or empty. FailureOr -packTranspose(RewriterBase &rewriter, tensor::PackOp packOp, - linalg::LinalgOp linalgOp, tensor::UnPackOp maybeUnPackOp, +packTranspose(RewriterBase &rewriter, linalg::PackOp packOp, + linalg::LinalgOp linalgOp, linalg::UnPackOp maybeUnPackOp, ArrayRef outerPerm, ArrayRef innerPerm); /// Pack a LinalgOp by greedily inferring matmul dimensions (m, n, k) where m @@ -1526,15 +1526,15 @@ struct DecomposePadOpPattern : public OpRewritePattern { const SmallVector &dynSizes) const; }; -/// Rewrites a tensor::PackOp into a sequence of: +/// Rewrites a linalg::PackOp into a sequence of: /// * tensor::PadOp + linalg::TransposeOp + tensor::EmptyOp + /// tensor::InsertSliceOp ops. /// -/// Requires that all the outer dims of the input tensor::PackOp are 1. +/// Requires that all the outer dims of the input linalg::PackOp are 1. /// /// Before: /// ``` -/// %packed = tensor.pack %input +/// %packed = linalg.pack %input /// padding_value(%pad : f32) /// inner_dims_pos = [1, 0] /// inner_tiles = [2, %high] @@ -1560,20 +1560,20 @@ struct DecomposePadOpPattern : public OpRewritePattern { /// : tensor<2x?xf32> into tensor<1x1x2x?xf32> /// ``` struct DecomposeOuterUnitDimsPackOpPattern - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(tensor::PackOp packOp, + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(linalg::PackOp packOp, PatternRewriter &rewriter) const override; }; -/// Rewrites a tensor::UnPackOp into a sequence of rank-reduced +/// Rewrites a linalg::UnPackOp into a sequence of rank-reduced /// * tensor::ExtractSliceOp + linalg::TransposeOp + tensor::InsertSliceOp /// -/// Requires that all the outer dims of the input tensor::PackOp are 1. +/// Requires that all the outer dims of the input linalg::PackOp are 1. 
/// /// Before: /// ``` -/// %packed = tensor.unpack %input +/// %packed = linalg.unpack %input /// inner_dims_pos = [1, 0] /// inner_tiles = [2, 8] /// into %output : tensor<1x1x2x8xf32> -> tensor<5x1xf32> @@ -1594,9 +1594,9 @@ struct DecomposeOuterUnitDimsPackOpPattern /// : tensor<8x2xf32> to tensor<5x1xf32> /// ``` struct DecomposeOuterUnitDimsUnPackOpPattern - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(tensor::UnPackOp unpackOp, + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(linalg::UnPackOp unpackOp, PatternRewriter &rewriter) const override; }; @@ -1718,7 +1718,7 @@ void populateLinalgGenericOpsSpecializationPatterns( void populateDecomposeConvolutionPatterns(RewritePatternSet &patterns, PatternBenefit benefit = 1); -/// Populates patterns to decompose tensor.pack and tensor.unpack Ops into e.g. +/// Populates patterns to decompose linalg.pack and linalg.unpack Ops into e.g. /// tensor.pad, linalg.transpose, tensor.{insert|extract}_slice. Require all /// outer dims to be unit. void populateDecomposePackUnpackPatterns(RewritePatternSet &patterns); @@ -1781,7 +1781,7 @@ void populateElementwiseOpsFusionPatterns( const ControlFusionFn &controlElementwiseOpFusion, bool replaceOutsDependency = true); -/// Function type which is used to control propagation of tensor.pack/unpack +/// Function type which is used to control propagation of linalg.pack/unpack /// ops. using ControlPropagationFn = std::function; @@ -1890,6 +1890,19 @@ void populateDecomposeWinogradOpsPatterns(RewritePatternSet &patterns); /// convert to a `linalg.dot`. void populateContractionOpRankReducingPatterns(RewritePatternSet &patterns); +/// Populates `patterns` with patterns that fold operations like `tensor.pad` +/// and `tensor.extract_slice` into `tensor.pack` and `tensor.unpack` operations +/// respectively. +void populateFoldIntoPackAndUnpackPatterns(RewritePatternSet &patterns); + +/// Populates `patterns` with patterns that fold operations like `linalg.pack` +/// and `linalg.unpack` into `tensor.empty`. +void populateFoldPackUnpackIntoTensorEmptyPatterns(RewritePatternSet &patterns); + +/// Populates `patterns` with patterns that simplify `tensor.pack` and +/// `tensor.unpack` operations. +void populateSimplifyPackAndUnpackPatterns(RewritePatternSet &patterns); + } // namespace linalg } // namespace mlir diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h index 1e4f3004dec7e..80aa034d2199d 100644 --- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h @@ -33,6 +33,24 @@ namespace linalg { //===----------------------------------------------------------------------===// // Utilities for inferring various semantics properties of Linalg ops. //===----------------------------------------------------------------------===// +/// Shell function to compute the Destination Permutation of PackOp +/// This function uses the helper function `computePackUnPackPerm` to get +/// the permutation vector. Only major difference between UnPack and Pack is +/// that packOp uses destination rank whereas unpack Uses source rank. +SmallVector getPackInverseDestPerm(linalg::PackOp packOp); + +/// Shell function to compute the Source Permutation of unPackOp. +/// This function, like the getPackInverseDestPerm uses the helper function +/// computePackUnPackPerm` to get the permutation vector. 
+/// Only major difference between UnPack and Pack is that packOp uses +/// destination rank whereas unpack Uses source rank. +SmallVector getUnPackInverseSrcPerm(linalg::UnPackOp unpackOp); + +/// Shell function to compute the Source rank permutation for unpackOp +/// Unpack requires some packing metadata data information, so created +/// another function where this value is passed by reference. +SmallVector getUnPackInverseSrcPerm(linalg::UnPackOp, + PackingMetadata &metadata); //===----------------------------------------------------------------------===// // General utilities diff --git a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h index 02ffa0da7a8b8..c0c11c9e38994 100644 --- a/mlir/include/mlir/Dialect/SCF/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/SCF/Utils/Utils.h @@ -126,6 +126,9 @@ FailureOr loopUnrollByFactor( scf::ForOp forOp, uint64_t unrollFactor, function_ref annotateFn = nullptr); +/// Unrolls this loop completely. +LogicalResult loopUnrollFull(scf::ForOp forOp); + /// Unrolls and jams this `scf.for` operation by the specified unroll factor. /// Returns failure if the loop cannot be unrolled either due to restrictions or /// due to invalid unroll factors. In case of unroll factor of 1, the function diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td index 1eacc564655a8..cafe140469570 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td @@ -4445,6 +4445,7 @@ def SPIRV_OC_OpSelectionMerge : I32EnumAttrCase<"OpSelectionMerge def SPIRV_OC_OpLabel : I32EnumAttrCase<"OpLabel", 248>; def SPIRV_OC_OpBranch : I32EnumAttrCase<"OpBranch", 249>; def SPIRV_OC_OpBranchConditional : I32EnumAttrCase<"OpBranchConditional", 250>; +def SPIRV_OC_OpKill : I32EnumAttrCase<"OpKill", 252>; def SPIRV_OC_OpReturn : I32EnumAttrCase<"OpReturn", 253>; def SPIRV_OC_OpReturnValue : I32EnumAttrCase<"OpReturnValue", 254>; def SPIRV_OC_OpUnreachable : I32EnumAttrCase<"OpUnreachable", 255>; @@ -4574,7 +4575,7 @@ def SPIRV_OpcodeAttr : SPIRV_OC_OpAtomicAnd, SPIRV_OC_OpAtomicOr, SPIRV_OC_OpAtomicXor, SPIRV_OC_OpPhi, SPIRV_OC_OpLoopMerge, SPIRV_OC_OpSelectionMerge, SPIRV_OC_OpLabel, SPIRV_OC_OpBranch, SPIRV_OC_OpBranchConditional, - SPIRV_OC_OpReturn, SPIRV_OC_OpReturnValue, SPIRV_OC_OpUnreachable, + SPIRV_OC_OpKill, SPIRV_OC_OpReturn, SPIRV_OC_OpReturnValue, SPIRV_OC_OpUnreachable, SPIRV_OC_OpGroupBroadcast, SPIRV_OC_OpGroupIAdd, SPIRV_OC_OpGroupFAdd, SPIRV_OC_OpGroupFMin, SPIRV_OC_OpGroupUMin, SPIRV_OC_OpGroupSMin, SPIRV_OC_OpGroupFMax, SPIRV_OC_OpGroupUMax, SPIRV_OC_OpGroupSMax, diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVControlFlowOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVControlFlowOps.td index cc2f0e4962d8a..ade20f915c0c3 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVControlFlowOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVControlFlowOps.td @@ -242,6 +242,48 @@ def SPIRV_FunctionCallOp : SPIRV_Op<"FunctionCall", [ // ----- +def SPIRV_KillOp : SPIRV_Op<"Kill", [Terminator]> { + let summary = [{ + Deprecated (use OpTerminateInvocation or OpDemoteToHelperInvocation). + }]; + + let description = [{ + Fragment-shader discard. + + Ceases all further processing in any invocation that executes it: Only + instructions these invocations executed before OpKill have observable + side effects. 
If this instruction is executed in non-uniform control + flow, all subsequent control flow is non-uniform (for invocations that + continue to execute). + + This instruction must be the last instruction in a block. + + This instruction is only valid in the Fragment Execution Model. + + + + #### Example: + + ```mlir + spirv.Kill + ``` + }]; + + let availability = [ + MinVersion, + MaxVersion, + Extension<[]>, + Capability<[SPIRV_C_Shader]> + ]; + + let arguments = (ins); + let results = (outs); + let assemblyFormat = "attr-dict"; + let hasVerifier = 0; +} + +// ----- + def SPIRV_LoopOp : SPIRV_Op<"mlir.loop", [InFunctionScope]> { let summary = "Define a structured loop."; diff --git a/mlir/include/mlir/Dialect/Tensor/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Tensor/IR/CMakeLists.txt index 74a05291376b3..cd14fe5c04561 100644 --- a/mlir/include/mlir/Dialect/Tensor/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Tensor/IR/CMakeLists.txt @@ -1,8 +1,2 @@ add_mlir_dialect(TensorOps tensor) add_mlir_doc(TensorOps TensorOps Dialects/ -gen-dialect-doc) - -set(LLVM_TARGET_DEFINITIONS TensorInterfaces.td) -mlir_tablegen(TensorInterfaces.h.inc -gen-op-interface-decls) -mlir_tablegen(TensorInterfaces.cpp.inc -gen-op-interface-defs) -add_public_tablegen_target(MLIRTensorInterfacesIncGen) -add_dependencies(mlir-headers MLIRTensorInterfacesIncGen) diff --git a/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h b/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h index b3ec796a72337..eb550bb469b9f 100644 --- a/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h +++ b/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h @@ -46,12 +46,6 @@ SmallVector getOrCreateRanges(OffsetSizeAndStrideOpInterface op, #include "mlir/Dialect/Tensor/IR/TensorOpsDialect.h.inc" -//===----------------------------------------------------------------------===// -// Tensor Interfaces -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Tensor/IR/TensorInterfaces.h.inc" - //===----------------------------------------------------------------------===// // Tensor Dialect Operations //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorInterfaces.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorInterfaces.td deleted file mode 100644 index 522a9c56f3c92..0000000000000 --- a/mlir/include/mlir/Dialect/Tensor/IR/TensorInterfaces.td +++ /dev/null @@ -1,33 +0,0 @@ -//===- TensorInterfaces.td - Tensor Interfaces Declaration -*- tablegen -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This is the definition file for the structured interface sfor Tensor ops. -// -//===----------------------------------------------------------------------===// - -#ifndef TENSOR_IR_TENSORINTERFACES -#define TENSOR_IR_TENSORINTERFACES - -include "mlir/Interfaces/DestinationStyleOpInterface.td" -include "mlir/IR/OpBase.td" - -// TODO: To be moved to LinalgInterfaces.td, see: -// * https://github.com/llvm/llvm-project/pull/123902 -// * https://discourse.llvm.org/t/rfc-move-tensor-pack-and-tensor-unpack-into-linalg/ -def TensorRelayoutOpInterface : OpInterface<"RelayoutOpInterface"> { - let description = [{ - A Tensor (soon to be Linalg) relayout-op is either tensor.pack or - tensor.unpack. 
- - While we could extend this interface with methods from Tensor_RelayoutOp, - this is currently not needed and left as a TODO. - }]; - let cppNamespace = "::mlir::tensor"; -} - -#endif // TENSOR_IR_TENSORINTERFACES diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td index f6927f5ebcfb8..35d0b16628417 100644 --- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td +++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td @@ -10,7 +10,6 @@ #define TENSOR_OPS include "mlir/Dialect/Tensor/IR/TensorBase.td" -include "mlir/Dialect/Tensor/IR/TensorInterfaces.td" include "mlir/Interfaces/CastInterfaces.td" include "mlir/Interfaces/ControlFlowInterfaces.td" include "mlir/Interfaces/DestinationStyleOpInterface.td" @@ -1824,315 +1823,6 @@ def Tensor_SplatOp : Tensor_Op<"splat", [ let hasVerifier = 1; } -//===----------------------------------------------------------------------===// -// RelayoutOp -//===----------------------------------------------------------------------===// - -class Tensor_RelayoutOp traits = []> : - Tensor_Op, - DestinationStyleOpInterface, - ConditionallySpeculatable, NoMemoryEffect, - DeclareOpInterfaceMethods, - TensorRelayoutOpInterface, - TypesMatchWith<"result type matches type of dest", - "dest", "result", - "$_self">])> { - - code commonExtraClassDeclaration = [{ - size_t getSourceRank() { return getSourceType().getRank(); }; - size_t getDestRank() { return getDestType().getRank(); }; - RankedTensorType getSourceType() { - return ::llvm::cast(getSource().getType()); }; - RankedTensorType getDestType() { - return ::llvm::cast(getDest().getType()); }; - - MutableOperandRange getDpsInitsMutable() { return getDestMutable(); } - - /// Interface method for ConditionallySpeculatable. - Speculation::Speculatability getSpeculatability(); - - /// Return a mapping from positions `inner_dims_pos` to their - /// tile factors. - DenseMap getDimAndTileMapping(); - - /// Return the tile sizes as OpFoldResult. - SmallVector getMixedTiles(); - - /// Return the tile sizes as `int64_t`. If a tile size is dynamic - /// a sentinel `kDynamic` is introduced at that position in - /// the returned vector. - SmallVector getStaticTiles(); - - /// Retrieve all outer dims for this Pack/UnPack Op, i.e. all the leading - /// dims excluding the trailing dims corresponding to `innerTiles`. Note - /// that this will include both tiled and non-tiled dimensions. The order - /// of the output dimensions is consistent with the shape of the packed - /// tensor. - ArrayRef getAllOuterDims(); - - /// Similar to `getAllOuterDims`, but only retrieve the outer dims that - /// have been tiled. Also, the order of the output dimensions is consistent - /// with `inner_dims_pos` rather than the packed tensor. - SmallVector getTiledOuterDims(); - }]; - - let hasVerifier = 1; -} - -//===----------------------------------------------------------------------===// -// PackOp -//===----------------------------------------------------------------------===// - -def Tensor_PackOp : Tensor_RelayoutOp<"pack", [ - AttrSizedOperandSegments]> { - let summary = "tensor pack operation"; - let description = [{ - The "pack" operation converts a source tensor of rank `n` into a result - tensor of rank `n + k` with a tiled and packed layout (maybe with padding) - and optionally transposes the tiled source tensor dimensions. - - `inner_dims_pos` (mandatory) specifies `k` source tensor dimensions that are - being tiled, where `0 < k <= n`. 
The order of the dimensions matters: - - The tiled dimensions (of size `inner_tiles`) are added to the end of the result - tensor in the order in which they appear in `inner_dims_pos`. - - `inner_dims_pos[i]` specifies the source tensor dimension tiled by - `inner_tiles[i]`. - - `inner_tiles` (mandatory) specifies `k` tile sizes. These tile sizes - correspond to the least significant ("inner") result tensor dimension sizes, - in the same order. Tile sizes can be static or dynamic. - - Example: If `inner_tiles = [16, 32]`, the result tensor has a shape of - `...x16x32`. If `inner_dims_pos = [0, 1]`, the 0th source dimension is tiled - by 16 and the 1st source dimension is tiled by 32. Other source dimensions - (if any) are not tiled. If `inner_dims_pos = [1, 0]`, the 1st dimension is - tiled by 16 and the 0th dimension is tiled by 32. - - Example: - ```mlir - // NC to NCnc - %0 = tensor.pack %source inner_dims_pos = [0, 1] inner_tiles = [8, 32] - into %dest : tensor<128x256xf32> -> tensor<16x8 x 8x32 xf32> - // \ / \ / - // outer dims inner dims - ``` - - `outer_dims_perm` (optional) specifies a permutation for the outer - dimensions. If specified, it must have `n` elements. - - Example: - ```mlir - // CK to KCck - %0 = tensor.pack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] - inner_tiles = [8, 32] into %dest - : tensor<128x256xf32> -> tensor<8x16 x 8x32 xf32> - // \ / - // compare with "NC to NCnc": outer dims are transposed - ``` - - `padding_value` specifies a padding value at the boundary on non-perfectly - divisible dimensions. Padding is optional: - - If absent, it is UB if the tile does not perfectly divide the dimension. - - If present, it will pad along high dimensions (high-padding) to make the - tile complete. - - Example: - ```mlir - %0 = tensor.pack %arg0 padding_value(%pad : f32) outer_dims_perm = [2, 1, 0] - inner_dims_pos = [1] inner_tiles = [2] into %arg1 - : tensor<200x127x256xf32> -> tensor<256x64x200x2xf32> - // \ - // padded and tiled dim - // - // Source dimension 1 is tiled. 64 does not divide 127 evenly, so 1 padded - // element is added at the end. - // - // Note: Only tiled dimensions can be padded. - ``` - }]; - let arguments = (ins AnyRankedTensor:$source, - AnyRankedTensor:$dest, - Optional:$padding_value, - DefaultValuedOptionalAttr:$outer_dims_perm, - DenseI64ArrayAttr:$inner_dims_pos, - Variadic:$inner_tiles, - DenseI64ArrayAttr:$static_inner_tiles); - let results = (outs AnyRankedTensor:$result); - let assemblyFormat = [{ - $source - (`padding_value` `(` $padding_value^ `:` type($padding_value) `)`)? - (`outer_dims_perm` `=` $outer_dims_perm^)? - `inner_dims_pos` `=` $inner_dims_pos - `inner_tiles` `=` - custom($inner_tiles, $static_inner_tiles) - `into` $dest attr-dict `:` type($source) `->` type($dest) - }]; - - let builders = [ - OpBuilder<(ins "Value":$source, "Value":$dest, - "ArrayRef":$innerDimsPos, - "ArrayRef":$innerTiles, - CArg<"std::optional", "std::nullopt">:$paddingValue, - CArg<"ArrayRef", "{}">:$outerDimsPerm)> - ]; - - let extraClassDeclaration = commonExtraClassDeclaration # [{ - // Method to get the shape of the result as `SmallVector`. - // This is a static method to allow getting the shape of the destination - // expected while creating a `pack` op. 
- static SmallVector getResultShape(OpBuilder &builder, - Location loc, ArrayRef sourceDims, - ArrayRef innerTileDims, ArrayRef innerDimsPos, - ArrayRef outerDimsPerm = {}); - - // Method to get the `RankedTensorType` of the result based on the inner - // tiles, position of the inner tiles (innerDimsPos) and interchange vector - // of outer loops (outerDimsPerm). - static RankedTensorType inferPackedType(RankedTensorType sourceType, - ArrayRef innerTileSizes, ArrayRef innerDimsPos, - ArrayRef outerDimsPerm = {}); - - // Returns true if we have enough static information to catch undefined - // behavior when the tile size does not divide perfectly the dimension of - // the input tensor. Detecting UB requires that the input size and either - // corresponding tile or output size are static. - static bool requirePaddingValue(ArrayRef inputShape, - ArrayRef innerDimsPos, - ArrayRef outputShape, - ArrayRef outerDimsPerm, - ArrayRef innerTiles); - - static Value createDestinationTensor(OpBuilder &b, Location loc, - Value source, ArrayRef innerTileSizes, - ArrayRef innerDimsPos, ArrayRef outerDimsPerm); - - /// Build and return a new PackOp that is a clone of the current PackOp with - /// (innerDimsPos, innerTiles) (resp. outerDimsPerm) are permuted by - /// innerPermutation (resp. outerPermutation). - /// A new `tensor.empty` of the proper shape is built in the process. - /// Asserts that: - /// - At least one of innerPermutation or outerPermutation is non-empty. - /// - If not empty, innerPermutation is a valid permutation of size - /// matching innerDimPos. - /// - If not empty, outerPermutation is a valid permutation of size - /// matching outerDimsPerm. - PackOp createTransposedClone(OpBuilder &b, - Location loc, - ArrayRef innerPermutation, - ArrayRef outerPermutation); - - /// Check if this PackOp is like a simple pad operation. - /// In other words, this operation: - /// 1. adds useless dimensions (dimension of size 1), - /// 2. pads the other ones, and - /// 3. doesn't shuffle the dimensions - bool isLikePad(); - }]; - - let hasCanonicalizeMethod = 1; - - let hasFolder = 1; -} - -//===----------------------------------------------------------------------===// -// UnPackOp -//===----------------------------------------------------------------------===// - -def Tensor_UnPackOp : Tensor_RelayoutOp<"unpack"> { - let summary = "tensor unpack operation"; - let description = [{ - The "unpack" operation converts a source tensor of rank `n` with a tiled and - packed layout to a result tensor of rank `n - k`. - - `inner_dims_pos` (mandatory) specifies `k` source tensor dimensions with - which the last `k` source tensor dimensions are combined, where - `0 < k <= n/2`. Each `inner_dims_pos` element must be `>= 0` and `< n - k`. - The order of the dimensions in `inner_dims_pos` matters: dimension - `inner_dims_pos[i]` is combined with dimension `n - k + i` (assuming that - `outer_dims_perm` is not specified). - - `inner_tiles` (mandatory) specifies `k` tile sizes. These tile sizes - correspond to the least significant ("inner") source tensor dimension sizes. - The behavior of this op is undefined if: - - `inner_tiles` do not exactly match with the corresponding source tensor - dimension sizes. - - Or, `inner_tiles[i]` does not divide the size of dimension - `inner_dims_pos[i]` (assuming that `outer_dims_perm` is not specified) - evenly. - - `outer_dims_perm` (optional) specifies a permutation for the outer - dimensions. If specified, it must have `n - k` elements. 
If specified, this - permutation is applied before combining any dimensions. - - Example: - - ```mlir - // NCnc to NC: - %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [8, 32] - into %dest : tensor<16x8x8x32xf32> -> tensor<128x256xf32> - - // CK to KCck: - %0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] - inner_tiles = [8, 32] into %dest - : tensor<8x16x8x32xf32> -> tensor<128x256xf32> - ``` - }]; - let arguments = (ins AnyRankedTensor:$source, - AnyRankedTensor:$dest, - DefaultValuedOptionalAttr:$outer_dims_perm, - DenseI64ArrayAttr:$inner_dims_pos, - Variadic:$inner_tiles, - DenseI64ArrayAttr:$static_inner_tiles); - let results = (outs AnyRankedTensor:$result); - let assemblyFormat = [{ - $source - (`outer_dims_perm` `=` $outer_dims_perm^)? - `inner_dims_pos` `=` $inner_dims_pos - `inner_tiles` `=` - custom($inner_tiles, $static_inner_tiles) - `into` $dest attr-dict `:` type($source) `->` type($dest) - }]; - - let builders = [ - OpBuilder<(ins "Value":$source, "Value":$dest, - "ArrayRef":$innerDimsPos, - "ArrayRef":$innerTiles, - CArg<"ArrayRef", "{}">:$outerDimsPerm)> - ]; - - let extraClassDeclaration = commonExtraClassDeclaration # [{ - static Value createDestinationTensor(OpBuilder &b, Location loc, - Value source, ArrayRef innerTileSizes, - ArrayRef innerDimsPos, ArrayRef outerDimsPerm); - - /// Build and return a new UnPackOp that is a clone of the current UnPackOp - /// with (innerDimsPos, innerTiles) (resp. outerDimsPerm) are permuted by - /// innerPermutation (resp. outerPermutation). - /// Asserts that: - /// - At least one of innerPermutation or outerPermutation is non-empty. - /// - If not empty, innerPermutation is a valid permutation of size - /// matching innerDimPos. - /// - If not empty, outerPermutation is a valid permutation of size - /// matching outerDimsPerm. - UnPackOp createTransposedClone(OpBuilder &b, - Location loc, - Value transposedSource, - ArrayRef innerPermutation, - ArrayRef outerPermutation); - - /// Check if this UnPackOp is like a simple unpad operation. - /// In other words, this operation: - /// 1. drops useless dimensions (dimension of size 1), and - /// 2. reduces dimensions in place (i.e., no transpose.) - bool isLikeUnPad(); - }]; - - let hasCanonicalizeMethod = 1; - - let hasFolder = 1; -} - //===----------------------------------------------------------------------===// // YieldOp //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td b/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td index 81bab1b0c82f7..fcb10f55d556d 100644 --- a/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td +++ b/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td @@ -53,16 +53,6 @@ def ApplyFoldTensorEmptyPatternsOp : Op:$fold_single_use_only); let assemblyFormat = "attr-dict"; } -def ApplyFoldIntoPackAndUnpackPatternsOp : Op]> { - let description = [{ - Indicates that operations like tensor.pad and tensor.extract_slice should - be folded into tensor.pack and tensor.unpack operations, respectively. 
- }]; - - let assemblyFormat = "attr-dict"; -} def ApplyFoldTensorSubsetOpsPatternsOp : Op; /// Populates `patterns` with patterns that replace tensor ops (such as diff --git a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h index ed1ec1e871482..83cc665b5a4fb 100644 --- a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h @@ -42,25 +42,6 @@ FailureOr computeTransposedType(RankedTensorType rankedTensorType, ArrayRef transposeVector); -/// Shell function to compute the Destination Permutation of PackOp -/// This function uses the helper function `computePackUnPackPerm` to get -/// the permutation vector. Only major difference between UnPack and Pack is -/// that packOp uses destination rank whereas unpack Uses source rank. -SmallVector getPackInverseDestPerm(tensor::PackOp packOp); - -/// Shell function to compute the Source Permutation of unPackOp. -/// This function, like the getPackInverseDestPerm uses the helper function -/// computePackUnPackPerm` to get the permutation vector. -/// Only major difference between UnPack and Pack is that packOp uses -/// destination rank whereas unpack Uses source rank. -SmallVector getUnPackInverseSrcPerm(tensor::UnPackOp unpackOp); - -/// Shell function to compute the Source rank permutation for unpackOp -/// Unpack requires some packing metadata data information, so created -/// another function where this value is passed by reference. -SmallVector getUnPackInverseSrcPerm(tensor::UnPackOp, - PackingMetadata &metadata); - /// A tensor.insert_slice is a cast-like operation if it merely rank-extends the /// source tensor or inserts the source tensor into a destination tensor with /// the same shape. diff --git a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h index 3fa35bf1851a9..3af89a6ab3799 100644 --- a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h +++ b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h @@ -568,6 +568,13 @@ struct PackingMetadata { // repeated N^2 counts). PackingMetadata computePackingMetadata(int64_t packedRank, ArrayRef innerDimPos); + +/// Try to remove a tensor operation if it would only reshape a constant. +/// Removes the op and replaces the constant with a new constant of the result +/// shape. When an optional cst attribute is passed, it is reshaped only if the +/// splat value matches the value in the attribute. 
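+///
+/// Illustrative sketch (not taken from a test; the shapes mirror the pack
+/// examples elsewhere in this patch): when the source constant is a splat,
+/// folding a consumer such as linalg.pack simply re-creates the constant with
+/// the result type:
+///
+/// ```mlir
+/// %cst = arith.constant dense<0.0> : tensor<128x256xf32>
+/// %0 = linalg.pack %cst inner_dims_pos = [0, 1] inner_tiles = [8, 32]
+///        into %dest : tensor<128x256xf32> -> tensor<16x8x8x32xf32>
+/// ```
+///
+/// folds into:
+///
+/// ```mlir
+/// %0 = arith.constant dense<0.0> : tensor<16x8x8x32xf32>
+/// ```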
+OpFoldResult reshapeConstantSource(DenseElementsAttr source, TensorType result, + std::optional cst = std::nullopt); } // namespace mlir #endif // MLIR_DIALECT_UTILS_RESHAPEOPSUTILS_H diff --git a/mlir/include/mlir/Dialect/X86Vector/X86Vector.td b/mlir/include/mlir/Dialect/X86Vector/X86Vector.td index 16181d7e760db..566013e73f4b8 100644 --- a/mlir/include/mlir/Dialect/X86Vector/X86Vector.td +++ b/mlir/include/mlir/Dialect/X86Vector/X86Vector.td @@ -341,6 +341,46 @@ def DotBF16Ps512IntrOp : AVX512_IntrOp<"dpbf16ps.512", 1, [Pure, let results = (outs VectorOfLengthAndType<[16], [F32]>:$res); } +//----------------------------------------------------------------------------// +// Convert packed F32 to packed BF16 +//----------------------------------------------------------------------------// + +def CvtPackedF32ToBF16Op : AVX512_Op<"cvt.packed.f32_to_bf16", [Pure, + AllElementCountsMatch<["a", "dst"]>]> { + let summary = "Convert packed F32 to packed BF16 Data."; + let description = [{ + The `convert_f32_to_bf16` op is an AVX512-BF16 specific op that can lower + to the proper LLVMAVX512BF16 operation `llvm.cvtneps2bf16` depending on + the width of MLIR vectors it is applied to. + + #### From the Intel Intrinsics Guide: + + Convert packed single-precision (32-bit) floating-point elements in `a` to + packed BF16 (16-bit) floating-point elements, and store the results in `dst`. + + Example: + ```mlir + %dst = x86vector.avx512.cvt.packed.f32_to_bf16 %a : vector<8xf32> -> vector<8xbf16> + ``` + }]; + let arguments = (ins VectorOfLengthAndType<[8, 16], [F32]>:$a); + let results = (outs VectorOfLengthAndType<[8, 16], [BF16]>:$dst); + let assemblyFormat = + "$a attr-dict `:` type($a) `->` type($dst)"; +} + +def CvtNeF32ToBF16Ps256IntrOp : AVX512_IntrOp<"cvtneps2bf16.256", 1, [Pure], + /*extension=*/"bf16"> { + let arguments = (ins VectorOfLengthAndType<[8], [F32]>:$a); + let results = (outs VectorOfLengthAndType<[8], [BF16]>:$res); +} + +def CvtNeF32ToBF16Ps512IntrOp : AVX512_IntrOp<"cvtneps2bf16.512", 1, [Pure], + /*extension=*/"bf16"> { + let arguments = (ins VectorOfLengthAndType<[16], [F32]>:$a); + let results = (outs VectorOfLengthAndType<[16], [BF16]>:$res); +} + //===----------------------------------------------------------------------===// // AVX op definitions //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index d06f10d3137a1..1bdeb3e356f4b 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -2564,7 +2564,7 @@ CompilationTarget TargetOptions::getDefaultCompilationTarget() { } std::pair> -TargetOptions::tokenizeCmdOptions() const { +TargetOptions::tokenizeCmdOptions(const std::string &cmdOptions) { std::pair> options; llvm::StringSaver stringSaver(options.first); StringRef opts = cmdOptions; @@ -2586,6 +2586,11 @@ TargetOptions::tokenizeCmdOptions() const { return options; } +std::pair> +TargetOptions::tokenizeCmdOptions() const { + return tokenizeCmdOptions(cmdOptions); +} + MLIR_DEFINE_EXPLICIT_TYPE_ID(::mlir::gpu::TargetOptions) #include "mlir/Dialect/GPU/IR/GPUOpInterfaces.cpp.inc" diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp index 8dcf6bab127a6..78ff31a75ca4c 100644 --- a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp +++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp @@ -58,6 +58,7 @@ void buildCommonPassPipeline( 
nvvmTargetOptions.chip = options.cubinChip; nvvmTargetOptions.features = options.cubinFeatures; nvvmTargetOptions.optLevel = options.optLevel; + nvvmTargetOptions.cmdOptions = options.cmdOptions; pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions)); pm.addPass(createLowerAffinePass()); pm.addPass(createArithToLLVMConversionPass()); diff --git a/mlir/lib/Dialect/GPU/Transforms/NVVMAttachTarget.cpp b/mlir/lib/Dialect/GPU/Transforms/NVVMAttachTarget.cpp index dd705cd338312..a6f7464012f3a 100644 --- a/mlir/lib/Dialect/GPU/Transforms/NVVMAttachTarget.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/NVVMAttachTarget.cpp @@ -45,7 +45,7 @@ struct NVVMAttachTarget DictionaryAttr NVVMAttachTarget::getFlags(OpBuilder &builder) const { UnitAttr unitAttr = builder.getUnitAttr(); - SmallVector flags; + SmallVector flags; auto addFlag = [&](StringRef flag) { flags.push_back(builder.getNamedAttr(flag, unitAttr)); }; @@ -53,6 +53,22 @@ DictionaryAttr NVVMAttachTarget::getFlags(OpBuilder &builder) const { addFlag("fast"); if (ftzFlag) addFlag("ftz"); + + // Tokenize and set the optional command line options. + if (!cmdOptions.empty()) { + auto options = gpu::TargetOptions::tokenizeCmdOptions(cmdOptions); + if (!options.second.empty()) { + llvm::SmallVector nvvmOptionAttrs; + for (const char *opt : options.second) { + nvvmOptionAttrs.emplace_back( + mlir::StringAttr::get(builder.getContext(), StringRef(opt))); + } + flags.push_back(builder.getNamedAttr( + "ptxas-cmd-options", + mlir::ArrayAttr::get(builder.getContext(), nvvmOptionAttrs))); + } + } + if (!flags.empty()) return builder.getDictionaryAttr(flags); return nullptr; diff --git a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt index ce8dc6ccb0fa3..b4aeb44ac8faf 100644 --- a/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/IR/CMakeLists.txt @@ -13,6 +13,7 @@ add_mlir_dialect_library(MLIRLinalgDialect MLIRLinalgOpsEnumsIncGen MLIRLinalgOpsIncGen MLIRLinalgStructuredOpsIncGen + MLIRLinalgRelayoutOpsIncGen MLIRShardingInterfaceIncGen LINK_LIBS PUBLIC diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp index 9e50c355c5041..c256b18dd2b17 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgDialect.cpp @@ -114,6 +114,10 @@ void mlir::linalg::LinalgDialect::initialize() { #define GET_OP_LIST #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc" >(); + addOperations< +#define GET_OP_LIST +#include "mlir/Dialect/Linalg/IR/LinalgRelayoutOps.cpp.inc" + >(); // Fill the Linalg-specific OpName to RegionBuilder map. addNamedOpBuilders< @@ -130,13 +134,22 @@ void mlir::linalg::LinalgDialect::initialize() { >(); declarePromisedInterface(); declarePromisedInterface(); + + // ValueBoundsOpInterface declarePromisedInterface(); - declarePromisedInterface(); + declarePromisedInterface(); + + // Tiling Interface + declarePromisedInterface(); declarePromisedInterfaces(); + declarePromisedInterfaces(); declarePromisedInterfaces { /// 1. The pack op does not have padding value, or /// 2. The filled value and padding value are the same. static FailureOr foldFillPackIntoFillOp(RewriterBase &rewriter, - tensor::PackOp packOp) { + linalg::PackOp packOp) { auto fillOp = packOp.getSource().getDefiningOp(); if (!fillOp) return failure(); @@ -865,12 +866,12 @@ static FailureOr foldFillPackIntoFillOp(RewriterBase &rewriter, } /// Wrapper pattern that applies foldFillPackIntoFillOp method. 
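+/// Illustrative IR for this fold (a sketch; the value names and shapes are
+/// made up and no padding value is involved):
+///
+///   %f = linalg.fill ins(%cst : f32) outs(%src : tensor<128x256xf32>)
+///          -> tensor<128x256xf32>
+///   %p = linalg.pack %f inner_dims_pos = [0, 1] inner_tiles = [8, 32]
+///          into %dest : tensor<128x256xf32> -> tensor<16x8x8x32xf32>
+///
+/// becomes a single fill on the packed destination:
+///
+///   %p = linalg.fill ins(%cst : f32) outs(%dest : tensor<16x8x8x32xf32>)
+///          -> tensor<16x8x8x32xf32>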
-struct FoldFillWithPack : public OpRewritePattern { +struct FoldFillWithPack : public OpRewritePattern { public: FoldFillWithPack(MLIRContext *context) - : OpRewritePattern(context) {} + : OpRewritePattern(context) {} - LogicalResult matchAndRewrite(tensor::PackOp packOp, + LogicalResult matchAndRewrite(linalg::PackOp packOp, PatternRewriter &rewriter) const override { auto fillOp = foldFillPackIntoFillOp(rewriter, packOp); if (failed(fillOp)) @@ -2289,6 +2290,8 @@ LogicalResult IndexOp::verify() { #define GET_OP_CLASSES #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc" +#define GET_OP_CLASSES +#include "mlir/Dialect/Linalg/IR/LinalgRelayoutOps.cpp.inc" AffineMap mlir::linalg::extractOrIdentityMap(std::optional maybeMap, unsigned rank, @@ -3429,20 +3432,9 @@ FailureOr WinogradOutputTransformOp::getTiledImplementation( //===----------------------------------------------------------------------===// // LinalgDialect +// TODO: Merge with the LinalgDialect block at the bottom //===----------------------------------------------------------------------===// -void LinalgDialect::getCanonicalizationPatterns( - RewritePatternSet &results) const { - results.add(getContext()); -} - -Operation *LinalgDialect::materializeConstant(OpBuilder &builder, - Attribute value, Type type, - Location loc) { - return arith::ConstantOp::materialize(builder, value, type, loc); -} - // Returns true if the result expression of `subMap` are a subset of `fullMap`. static bool areResultExprsSubsetOf(AffineMap subMap, AffineMap fullMap) { auto explicitRange = subMap.getResults(); @@ -4076,5 +4068,1076 @@ Speculation::Speculatability BatchMatmulOp::getSpeculatability() { return getGenericSpeculatabilityImpl(cast(getOperation())); } +//===----------------------------------------------------------------------===// +// PackOp/UnPackOp Common +//===----------------------------------------------------------------------===// +// Given the (potentially) updated packed type, `newPackedTy`, generates an +// updated mixed-tile-sizes attribute. A tile size is updated only +// when: +// * a dim from newPackedTy is static, and +// * the corresponding size from mixedTiles is still dynamic. +// Otherwise, the original tile size is preserved. +// Note - packed-type-dim and mixed-tile-size should always match! +static SmallVector +getNewMixedTileSizes(PatternRewriter &rewriter, Type newPackedTy, + SmallVector mixedTiles) { + SmallVector newMixedTileSizes; + for (auto it : llvm::zip(cast(newPackedTy) + .getShape() + .take_back(mixedTiles.size()), + mixedTiles)) { + int64_t shape = std::get<0>(it); + if (shape == ShapedType::kDynamic) { + newMixedTileSizes.push_back(std::get<1>(it)); + continue; + } + + // If the current result dim is static, update the dynamic mixed-size + // (provided the original value is dynamic). 
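+    // Illustrative: a tile given as a Value that is known to be the constant
+    // 8, paired with a now-static packed dim of 8, becomes the IntegerAttr 8;
+    // a tile that is already an Attribute is left untouched.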
+ OpFoldResult tile = std::get<1>(it); + if (Attribute attr = llvm::dyn_cast_if_present(tile)) { + // Already a constant + newMixedTileSizes.push_back(tile); + } else { + assert(getConstantIntValue(tile).value() == shape && + "tile size and dim size don't match!"); + newMixedTileSizes.push_back( + (rewriter.getIntegerAttr(rewriter.getIndexType(), shape))); + } + } + + return newMixedTileSizes; +} + +template +static LogicalResult +reifyResultShapesImpl(OpTy op, OpBuilder &builder, + ReifiedRankedShapedTypeDims &reifiedReturnShapes) { + static_assert(llvm::is_one_of::value, + "applies to only pack or unpack operations"); + int64_t destRank = op.getDestRank(); + reifiedReturnShapes.resize(1, SmallVector(destRank)); + reifiedReturnShapes[0] = + tensor::getMixedSizes(builder, op.getLoc(), op.getDest()); + return success(); +} + +template +static DenseMap getDimAndTileMappingImpl(OpTy op) { + static_assert(llvm::is_one_of::value, + "applies to only pack or unpack operations"); + DenseMap dimAndTileMapping; + ArrayRef dimsToTile = op.getInnerDimsPos(); + SmallVector tiles = op.getMixedTiles(); + assert(tiles.size() == dimsToTile.size() && + "tiles must match indices of dimension to block"); + // bind the dimension `i` with the tile factor. + for (auto i : llvm::seq(0, dimsToTile.size())) + dimAndTileMapping[dimsToTile[i]] = tiles[i]; + return dimAndTileMapping; +} + +template +static SmallVector getMixedTilesImpl(OpTy op) { + static_assert(llvm::is_one_of::value, + "applies to only pack or unpack operations"); + Builder builder(op); + SmallVector mixedInnerTiles; + unsigned dynamicValIndex = 0; + for (int64_t staticTile : op.getStaticInnerTiles()) { + if (!ShapedType::isDynamic(staticTile)) + mixedInnerTiles.push_back(builder.getI64IntegerAttr(staticTile)); + else + mixedInnerTiles.push_back(op.getInnerTiles()[dynamicValIndex++]); + } + return mixedInnerTiles; +} + +template +static SmallVector getStaticTilesImpl(OpTy op) { + static_assert(llvm::is_one_of::value, + "applies to only pack or unpack operations"); + SmallVector dynamicTiles; + SmallVector staticTiles; + dispatchIndexOpFoldResults(op.getMixedTiles(), dynamicTiles, staticTiles); + return staticTiles; +} + +/// Returns true if `dimsPos` is invalid. It is invalid when: +/// a) It contains duplicate. +/// b) At least one dimension is out of bound (`dimPos` is >= 0 and < rank). +/// c) The number of elements in `dimsPos` is > than `rank`. +static bool isInvalidPackingPosSpecification(ArrayRef dimsPos, + size_t rank) { + size_t dimsPosSize = dimsPos.size(); + if (dimsPosSize > rank) + return true; + DenseSet uniqued; + for (int64_t dim : dimsPos) + uniqued.insert(dim); + if (dimsPosSize != uniqued.size()) + return true; + return llvm::any_of(dimsPos, [rank](int64_t dimPos) { + return dimPos < 0 || dimPos >= static_cast(rank); + }); +} + +/// Returns true if the dimension of `sourceShape` is smaller than the dimension +/// of the `limitShape`. 
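+/// For instance (illustrative): sourceShape = [4, ?] is in bound w.r.t.
+/// limitShape = [4, 8], since dynamic extents are accepted on either side,
+/// while sourceShape = [9, 2] is not in bound w.r.t. limitShape = [8, 2].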
+static bool areAllInBound(ArrayRef sourceShape, + ArrayRef limitShape) { + assert( + sourceShape.size() == limitShape.size() && + "expected source shape rank, and limit of the shape to have same rank"); + return llvm::all_of( + llvm::zip(sourceShape, limitShape), [](std::tuple it) { + int64_t sourceExtent = std::get<0>(it); + int64_t limit = std::get<1>(it); + return ShapedType::isDynamic(sourceExtent) || + ShapedType::isDynamic(limit) || sourceExtent <= limit; + }); +} + +template +static LogicalResult commonVerifierPackAndUnPackOp(OpTy packOrUnPack) { + static_assert(llvm::is_one_of::value, + "applies to only pack or unpack operations"); + Operation *op = packOrUnPack.getOperation(); + + // Return true if we have a zero-value tile. + auto hasZeros = [&](ArrayRef tiles) { + return llvm::any_of( + tiles, [](OpFoldResult tile) { return isConstantIntValue(tile, 0); }); + }; + + // Verify tiles. Do not allow zero tiles. + SmallVector mixedTiles = packOrUnPack.getMixedTiles(); + if (hasZeros(mixedTiles)) + return op->emitError("invalid zero tile factor"); + + // Verify inner_dims_pos and outer_dims_perm. + RankedTensorType unpackedType = (std::is_same::value) + ? packOrUnPack.getSourceType() + : packOrUnPack.getDestType(); + size_t unpackedRank = unpackedType.getRank(); + ArrayRef innerDimsPos = packOrUnPack.getInnerDimsPos(); + ArrayRef outerDimPerm = packOrUnPack.getOuterDimsPerm(); + if (isInvalidPackingPosSpecification(innerDimsPos, unpackedRank)) + return op->emitError("invalid inner_dims_pos vector"); + if (isInvalidPackingPosSpecification(outerDimPerm, unpackedRank)) + return op->emitError("invalid outer_dims_perm vector"); + if (!outerDimPerm.empty() && outerDimPerm.size() != unpackedRank) + return op->emitError("outer_dims_perm must be a permutation or empty"); + + // Tiling factors must be less than or equal to the input rank for pack (or + // output rank for unpack), and must match the number of `inner_dims_pos`. + if (mixedTiles.size() > unpackedRank) { + return op->emitError("tiling factors must be less than or equal to the " + "input rank for pack or output rank for unpack"); + } + if (mixedTiles.size() != innerDimsPos.size()) { + return op->emitError( + "tiling factors must equal the number of dimensions to tile"); + } + + ShapedType packedType = (std::is_same::value) + ? packOrUnPack.getDestType() + : packOrUnPack.getSourceType(); + size_t packedRank = packedType.getRank(); + // Require output rank to match input rank + number of blocking factors. + size_t expectedPackedRank = unpackedRank + mixedTiles.size(); + if (expectedPackedRank != packedRank) { + return op->emitError( + "packed rank != (unpacked rank + num tiling factors), got ") + << packedRank << " != " << expectedPackedRank; + } + + // Verify result shape is greater than the minimum expected + // by the pack operation, and that the output shape + // represents full tiles. + RankedTensorType expectedPackedType = PackOp::inferPackedType( + unpackedType, packOrUnPack.getStaticTiles(), innerDimsPos, outerDimPerm); + if (!areAllInBound(expectedPackedType.getShape(), packedType.getShape())) { + return op->emitError("the shape of output is not large enough to hold the " + "packed data. 
Expected at least ") + << expectedPackedType << ", got " << packedType; + } + if (!llvm::all_of( + llvm::zip(packedType.getShape().take_back(mixedTiles.size()), + mixedTiles), + [](std::tuple it) { + int64_t shape = std::get<0>(it); + if (Attribute attr = + llvm::dyn_cast_if_present(std::get<1>(it))) { + IntegerAttr intAttr = dyn_cast_or_null(attr); + int64_t staticTileSize = intAttr.getValue().getSExtValue(); + return shape == staticTileSize; + } + return ShapedType::isDynamic(shape); + })) { + return op->emitError("mismatch in inner tile sizes specified and shaped of " + "tiled dimension in the packed type"); + } + return success(); +} + +namespace { +/// Subset of PackOp/UnPackOp fields used to compute the result of applying +/// various permutations to the op. +// TODO: Add linalg.transpose + pack/unpack folding patterns that just reuse +// these. These may or may not become true foldings / canonicalizations +// depending on how aggressive we want to be in automatically folding +// transposes. +struct PackOrUnPackTransposeResult { + SmallVector innerDimsPos; + SmallVector innerTiles; + SmallVector outerDimsPerm; +}; +} // namespace + +template +static PackOrUnPackTransposeResult +commonPermutationOfPackAndUnPackOp(OpTy packOrUnPackOp, + ArrayRef innerPermutation, + ArrayRef outerPermutation) { + static_assert(llvm::is_one_of::value, + "applies to only pack or unpack operations"); + assert((!innerPermutation.empty() || !outerPermutation.empty()) && + "some permutation must be non-empty"); + PackOrUnPackTransposeResult metadata; + metadata.innerDimsPos = + SmallVector(packOrUnPackOp.getInnerDimsPos()); + metadata.innerTiles = + SmallVector(packOrUnPackOp.getMixedTiles()); + int64_t numOuterDims = std::is_same::value + ? packOrUnPackOp.getSourceRank() + : packOrUnPackOp.getDestRank(); + metadata.outerDimsPerm = + packOrUnPackOp.getOuterDimsPerm().empty() + ? llvm::to_vector(llvm::seq(0, numOuterDims)) + : SmallVector(packOrUnPackOp.getOuterDimsPerm()); + if (!innerPermutation.empty()) { + assert(innerPermutation.size() == metadata.innerDimsPos.size() && + isPermutationVector(innerPermutation) && + "invalid inner permutation"); + applyPermutationToVector(metadata.innerDimsPos, innerPermutation); + applyPermutationToVector(metadata.innerTiles, innerPermutation); + } + if (!outerPermutation.empty()) { + assert(outerPermutation.size() == metadata.outerDimsPerm.size() && + isPermutationVector(outerPermutation) && + "invalid outer permutation"); + applyPermutationToVector(metadata.outerDimsPerm, outerPermutation); + } + return metadata; +} + +//===----------------------------------------------------------------------===// +// PackOp +//===----------------------------------------------------------------------===// + +void PackOp::getAsmResultNames(function_ref setNameFn) { + setNameFn(getResult(), "pack"); +} + +void PackOp::build(OpBuilder &builder, OperationState &state, Value source, + Value dest, ArrayRef innerDimsPos, + ArrayRef innerTiles, + std::optional paddingValue, + ArrayRef outerDimsPerm) { + assert(innerDimsPos.size() == innerTiles.size() && + "number of tile sizes specified must match the specified number of " + "original dimensions to be tiled"); + SmallVector staticTileSizes; + SmallVector dynamicTileSizes; + dispatchIndexOpFoldResults(innerTiles, dynamicTileSizes, staticTileSizes); + build(builder, state, dest.getType(), source, dest, + paddingValue ? *paddingValue : nullptr, + outerDimsPerm.empty() ? 
nullptr + : builder.getDenseI64ArrayAttr(outerDimsPerm), + builder.getDenseI64ArrayAttr(innerDimsPos), dynamicTileSizes, + builder.getDenseI64ArrayAttr(staticTileSizes)); +} + +LogicalResult +PackOp::reifyResultShapes(OpBuilder &builder, + ReifiedRankedShapedTypeDims &reifiedReturnShapes) { + return reifyResultShapesImpl(*this, builder, reifiedReturnShapes); +} + +DenseMap PackOp::getDimAndTileMapping() { + return getDimAndTileMappingImpl(*this); +} + +SmallVector PackOp::getMixedTiles() { + return getMixedTilesImpl(*this); +} + +SmallVector PackOp::getStaticTiles() { + return getStaticTilesImpl(*this); +} + +ArrayRef PackOp::getAllOuterDims() { + ShapedType inputType = getSourceType(); + int64_t inputRank = inputType.getRank(); + return getDestType().getShape().take_front(inputRank); +} + +SmallVector PackOp::getTiledOuterDims() { + auto innerDimsPos = getInnerDimsPos(); + auto packedShape = getDestType().getShape(); + SmallVector res; + + for (auto index : innerDimsPos) + res.push_back(packedShape[index]); + + return res; +} + +bool PackOp::requirePaddingValue(ArrayRef inputShape, + ArrayRef innerDimsPos, + ArrayRef outputShape, + ArrayRef outerDimsPerm, + ArrayRef innerTiles) { + SmallVector outputTileSizes( + outputShape.take_front(inputShape.size())); + if (!outerDimsPerm.empty()) { + assert(outerDimsPerm.size() == outputTileSizes.size() && + "expected output and outer_dims_perm to have same size"); + applyPermutationToVector(outputTileSizes, + invertPermutationVector(outerDimsPerm)); + } + for (auto [pos, tileSize] : llvm::zip_equal(innerDimsPos, innerTiles)) { + if (ShapedType::isDynamic(inputShape[pos])) + continue; + std::optional constantTile = getConstantIntValue(tileSize); + + if (!constantTile) { + if (!ShapedType::isDynamic(outputTileSizes[pos]) && + (inputShape[pos] % outputTileSizes[pos] != 0)) + return true; + } else if (inputShape[pos] % (*constantTile) != 0) { + return true; + } + } + return false; +} + +LogicalResult PackOp::verify() { + if (failed(commonVerifierPackAndUnPackOp(*this))) + return failure(); + + // Verify padding value, and bail out if the tile does not divide the + // dimension fully. In the case of dynamic tile factors or dimensions, having + // a partial tile is undefined behavior. + auto paddingValue = getPaddingValue(); + if (paddingValue && + paddingValue.getType() != getSourceType().getElementType()) { + return emitOpError("expected padding_value has ") + << getSourceType().getElementType() + << " but got: " << paddingValue.getType(); + } + + if (!paddingValue && + requirePaddingValue(getSourceType().getShape(), getInnerDimsPos(), + getDestType().getShape(), getOuterDimsPerm(), + getMixedTiles())) { + return emitOpError( + "invalid tile factor or output size provided. Only full tiles are " + "supported when padding_value is not set"); + } + return success(); +} + +/// Converts OpFoldResults to int64_t shape entries, unconditionally mapping all +/// Value's to kDynamic, even if they are arith.constant values. +static SmallVector +asShapeWithAnyValueAsDynamic(ArrayRef ofrs) { + SmallVector result; + for (auto o : ofrs) { + // Have to do this first, as getConstantIntValue special-cases constants. + if (llvm::dyn_cast_if_present(o)) + result.push_back(ShapedType::kDynamic); + else + result.push_back(getConstantIntValue(o).value_or(ShapedType::kDynamic)); + } + return result; +} + +/// Helper for PackOp::{getResultShape,inferPackedType}. Returns the shape of +/// the packed type. 
Having a shared helper helps implement these two methods in +/// a way that ensures that they agree on which dimensions are dynamic. +static SmallVector getPackOpResultTypeShape( + ArrayRef sourceShape, ArrayRef innerTileSizes, + ArrayRef innerDimsPos, ArrayRef outerDimsPerm) { + SmallVector resultShape = llvm::to_vector(sourceShape); + for (auto tiledDim : llvm::enumerate(llvm::to_vector(innerDimsPos))) { + if (ShapedType::isDynamic(resultShape[tiledDim.value()])) + continue; + if (ShapedType::isDynamic(innerTileSizes[tiledDim.index()])) { + resultShape[tiledDim.value()] = ShapedType::kDynamic; + continue; + } + resultShape[tiledDim.value()] = llvm::divideCeilSigned( + resultShape[tiledDim.value()], innerTileSizes[tiledDim.index()]); + } + + // Swap tile loops if outer_dims_perm is available. + if (!outerDimsPerm.empty()) + applyPermutationToVector(resultShape, outerDimsPerm); + + // Append the inner tile dimensions. + resultShape.append(innerTileSizes.begin(), innerTileSizes.end()); + return resultShape; +} + +SmallVector PackOp::getResultShape( + OpBuilder &builder, Location loc, ArrayRef sourceDims, + ArrayRef innerTileSizes, ArrayRef innerDimsPos, + ArrayRef outerDimsPerm) { + SmallVector resultDims = llvm::to_vector(sourceDims); + + AffineExpr s0, s1; + bindSymbols(builder.getContext(), s0, s1); + AffineExpr ceilDivExpr = s0.ceilDiv(s1); + for (auto tiledDim : llvm::enumerate(llvm::to_vector(innerDimsPos))) { + resultDims[tiledDim.value()] = affine::makeComposedFoldedAffineApply( + builder, loc, ceilDivExpr, + {resultDims[tiledDim.value()], innerTileSizes[tiledDim.index()]}); + } + if (!outerDimsPerm.empty()) + applyPermutationToVector(resultDims, outerDimsPerm); + resultDims.append(innerTileSizes.begin(), innerTileSizes.end()); + + SmallVector resultTypeShape = + getPackOpResultTypeShape(asShapeWithAnyValueAsDynamic(sourceDims), + asShapeWithAnyValueAsDynamic(innerTileSizes), + innerDimsPos, outerDimsPerm); + + // Fix-up `resultDims` to ensure that they are Value's if and only if the + // result type shape says it's a dynamic dim. This is needed as callers may + // use dispatchIndexOpFoldResults on the result, and rely on exact number of + // dynamic dims returned by that. + for (unsigned i = 0; i < resultDims.size(); ++i) { + if (!ShapedType::isDynamic(resultTypeShape[i])) + continue; + resultDims[i] = + getValueOrCreateConstantIndexOp(builder, loc, resultDims[i]); + } + + return resultDims; +} + +/// Get the expected packed type based on source type, tile factors, position of +/// the inner tiles and permutation of the outer tiled loop. 
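+/// For example (mirroring the pack documentation): a tensor<128x256xf32>
+/// source with innerTileSizes = [8, 32], innerDimsPos = [0, 1] and an empty
+/// outerDimsPerm infers tensor<16x8x8x32xf32>; with outerDimsPerm = [1, 0]
+/// it infers tensor<8x16x8x32xf32>.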
+RankedTensorType PackOp::inferPackedType(RankedTensorType sourceType, + ArrayRef innerTileSizes, + ArrayRef innerDimsPos, + ArrayRef outerDimsPerm) { + SmallVector resultShape = getPackOpResultTypeShape( + sourceType.getShape(), innerTileSizes, innerDimsPos, outerDimsPerm); + return RankedTensorType::get(resultShape, sourceType.getElementType()); +} + +Value PackOp::createDestinationTensor(OpBuilder &b, Location loc, Value source, + ArrayRef innerTileSizes, + ArrayRef innerDimsPos, + ArrayRef outerDimsPerm) { + AffineExpr dim0, dim1; + bindDims(b.getContext(), dim0, dim1); + auto ceilDiv = [&](OpFoldResult v1, OpFoldResult v2) -> OpFoldResult { + return affine::makeComposedFoldedAffineApply(b, loc, dim0.ceilDiv(dim1), + {v1, v2}); + }; + + SmallVector mixedSizes; + for (auto [index, value] : llvm::enumerate( + llvm::cast(source.getType()).getShape())) { + if (ShapedType::isDynamic(value)) + mixedSizes.push_back( + b.create(loc, source, index).getResult()); + else + mixedSizes.push_back(b.getIndexAttr(value)); + } + for (auto it : llvm::zip(innerDimsPos, innerTileSizes)) { + int64_t dimPos = std::get<0>(it); + OpFoldResult tileSize = std::get<1>(it); + mixedSizes[dimPos] = ceilDiv(mixedSizes[dimPos], tileSize); + } + if (!outerDimsPerm.empty()) + applyPermutationToVector(mixedSizes, outerDimsPerm); + + mixedSizes.append(innerTileSizes.begin(), innerTileSizes.end()); + auto elemType = llvm::cast(source.getType()).getElementType(); + return b.create(loc, mixedSizes, elemType); +} + +PackOp PackOp::createTransposedClone(OpBuilder &b, Location loc, + ArrayRef innerPermutation, + ArrayRef outerPermutation) { + PackOrUnPackTransposeResult metadata = commonPermutationOfPackAndUnPackOp( + *this, innerPermutation, outerPermutation); + Value transposedDest = + createDestinationTensor(b, loc, getSource(), metadata.innerTiles, + metadata.innerDimsPos, metadata.outerDimsPerm); + return b.create(loc, getSource(), transposedDest, + metadata.innerDimsPos, metadata.innerTiles, + getPaddingValue(), metadata.outerDimsPerm); +} + +/// Returns true if the tiles and the tiled dims are constant. +template +bool areTilesAndTiledDimsAllConstant(OpTy op) { + static_assert(llvm::is_one_of::value, + "applies to only pack or unpack operations"); + ShapedType packedType = (std::is_same::value) + ? op.getDestType() + : op.getSourceType(); + SmallVector mixedTiles = op.getMixedTiles(); + for (auto [dimDest, tile] : llvm::zip( + packedType.getShape().take_back(mixedTiles.size()), mixedTiles)) { + std::optional constTileSize = getConstantIntValue(tile); + if (!constTileSize || ShapedType::isDynamic(dimDest)) + return false; + } + return true; +} + +Speculation::Speculatability PackOp::getSpeculatability() { + if (getPaddingValue()) + return Speculation::Speculatable; + + // The verifier rejects already operations if we can statically prove that the + // sizes of the tiles do not divide perfectly the dimension; thus, check only + // to have constant tiles and tiled inner dimensions. + if (!areTilesAndTiledDimsAllConstant(*this)) + return Speculation::NotSpeculatable; + + return Speculation::Speculatable; +} + +// Return true if `inner_dims_pos` and `outer_dims_perm` target the same +// dimensions for pack and unpack. +static bool hasSameInnerOuterAttribute(PackOp packOp, UnPackOp unPackOp) { + if (packOp.getInnerDimsPos() != unPackOp.getInnerDimsPos()) + return false; + if (packOp.getOuterDimsPerm() == unPackOp.getOuterDimsPerm()) + return true; + // Outer dims permutation is optional. 
+ // To compare unbalanced pack-unpack pair, treat no permutation as equal to + // identity permutation. + return isIdentityPermutation(packOp.getOuterDimsPerm()) && + isIdentityPermutation(unPackOp.getOuterDimsPerm()); +} + +// Return true if pack and unpack have the same tiles. +// Same SSA values or same integer constants. +static bool haveSameTiles(PackOp packOp, UnPackOp unPackOp) { + auto packTiles = packOp.getMixedTiles(); + auto unPackTiles = unPackOp.getMixedTiles(); + if (packTiles.size() != unPackTiles.size()) + return false; + for (size_t i = 0, e = packTiles.size(); i < e; i++) { + if (!isEqualConstantIntOrValue(packTiles[i], unPackTiles[i])) + return false; + } + return true; +} + +/// Returns true if the pack op does not need a padding value. +static bool paddingIsNotNeeded(PackOp op) { + auto srcType = op.getSourceType(); + if (llvm::any_of(op.getInnerDimsPos(), + [&](int64_t pos) { return srcType.isDynamicDim(pos); })) + return false; + if (ShapedType::isDynamicShape(op.getStaticInnerTiles())) + return false; + return !PackOp::requirePaddingValue( + srcType.getShape(), op.getInnerDimsPos(), op.getDestType().getShape(), + op.getOuterDimsPerm(), op.getMixedTiles()); +} + +/// Returns true if the `srcShape` or `destShape` is different from the one in +/// `packOp` and populates each with the inferred static shape. +static bool inferStaticShape(PackOp packOp, SmallVectorImpl &srcShape, + SmallVectorImpl &destShape) { + bool changeNeeded = false; + srcShape.assign(packOp.getSourceType().getShape().begin(), + packOp.getSourceType().getShape().end()); + destShape.assign(packOp.getDestType().getShape().begin(), + packOp.getDestType().getShape().end()); + llvm::SmallSetVector innerDims; + innerDims.insert(packOp.getInnerDimsPos().begin(), + packOp.getInnerDimsPos().end()); + SmallVector inverseOuterDimsPerm; + if (!packOp.getOuterDimsPerm().empty()) + inverseOuterDimsPerm = invertPermutationVector(packOp.getOuterDimsPerm()); + int srcRank = packOp.getSourceRank(); + for (auto i : llvm::seq(0, srcRank)) { + if (innerDims.contains(i)) + continue; + int64_t srcPos = i; + int64_t destPos = i; + if (!inverseOuterDimsPerm.empty()) + destPos = inverseOuterDimsPerm[srcPos]; + if (ShapedType::isDynamic(srcShape[srcPos]) == + ShapedType::isDynamic(destShape[destPos])) { + continue; + } + int64_t size = srcShape[srcPos]; + if (ShapedType::isDynamic(size)) + size = destShape[destPos]; + srcShape[srcPos] = size; + destShape[destPos] = size; + changeNeeded = true; + } + return changeNeeded; +} + +LogicalResult PackOp::canonicalize(PackOp packOp, PatternRewriter &rewriter) { + // Fold an pack(unpack(x)) to x. + if (auto unPackOp = packOp.getSource().getDefiningOp()) { + if (unPackOp.getSourceType() != packOp.getDestType()) + return failure(); + if (packOp.getPaddingValue() || + !hasSameInnerOuterAttribute(packOp, unPackOp) || + !haveSameTiles(packOp, unPackOp)) + return failure(); + rewriter.replaceOp(packOp, unPackOp.getSource()); + return success(); + } + + // Fold optional PaddingValue operand away if padding is not needed. + if (packOp.getPaddingValue() && paddingIsNotNeeded(packOp)) { + rewriter.startOpModification(packOp); + packOp.getPaddingValueMutable().clear(); + rewriter.finalizeOpModification(packOp); + return success(); + } + + // Insert tensor.cast ops if static shape inference is available.. 
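+  // Illustrative (hypothetical shapes): with inner_dims_pos = [1] and
+  // inner_tiles = [32], packing tensor<?x256xf32> into tensor<100x8x32xf32>
+  // lets us infer the untiled source dim as 100, so the source is wrapped in
+  // a tensor.cast to tensor<100x256xf32>; when the inferred destination type
+  // differs from the original result type, a tensor.cast back to the original
+  // type is inserted after the pack.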
+ SmallVector srcShape, destShape; + if (inferStaticShape(packOp, srcShape, destShape)) { + Location loc = packOp.getLoc(); + Value source = packOp.getSource(); + if (srcShape != packOp.getSourceType().getShape()) { + auto newSrcType = packOp.getSourceType().clone(srcShape); + source = + rewriter.create(loc, newSrcType, packOp.getSource()); + } + Value dest = packOp.getDest(); + RankedTensorType originalResultType = packOp.getDestType(); + bool needUpdateDestType = (destShape != originalResultType.getShape()); + if (needUpdateDestType) { + auto newDestType = packOp.getDestType().clone(destShape); + dest = + rewriter.create(loc, newDestType, packOp.getDest()); + } + rewriter.modifyOpInPlace(packOp, [&] { + packOp.getSourceMutable().assign(source); + packOp.getDestMutable().assign(dest); + packOp.getResult().setType(cast(dest.getType())); + }); + // Insert a cast if needed + if (needUpdateDestType) { + rewriter.setInsertionPointAfter(packOp); + auto castOp = + rewriter.create(loc, originalResultType, packOp); + rewriter.replaceAllUsesExcept(packOp, castOp, castOp); + } + return success(); + } + + return failure(); +} + +template +static bool isLikePadUnPad(PackOrUnpackOp packOp, + RankedTensorType packedTensorType) { + static_assert(std::is_same::value || + std::is_same::value, + "Function meant for pack/unpack"); + // This is a pad if packing only adds ones and we don't transpose dimensions. + + // Check that we are not transposing any dimensions. + ArrayRef innerDimsPos = packOp.getInnerDimsPos(); + int64_t numPackedDims = innerDimsPos.size(); + auto orderedDims = llvm::to_vector<4>(llvm::seq(0, numPackedDims)); + if (orderedDims != innerDimsPos) { + // Dimensions don't happen in order. + return false; + } + + ArrayRef packedShape = packedTensorType.getShape(); + int64_t packedRank = packedTensorType.getRank(); + // At this point we know that we are taking numPackedDims outer + // dimensions and pushing them all the way as the inner most dimensions. + // What's left on the outer most dimensions is, in this order: + // - the factor of the packed dimensions, then + // - the untouched dimensions + // This shifting inward of dimensions is a no-op (as opposed to a transpose) + // if all the dimensions that bubble outerward are ones. + // Therefore check that all the dimensions but the numPackedDims inner most + // ones are ones. + return llvm::all_of( + llvm::seq(0, packedRank - numPackedDims), + [&packedShape](int64_t i) { return packedShape[i] == 1; }); +} + +bool PackOp::isLikePad() { + auto packedTensorType = + llvm::cast((*this)->getResultTypes().front()); + return isLikePadUnPad(*this, packedTensorType); +} + +OpFoldResult PackOp::fold(FoldAdaptor adaptor) { + std::optional paddingValue; + if (auto pad = adaptor.getPaddingValue()) + paddingValue = pad; + if (OpFoldResult reshapedSource = reshapeConstantSource( + llvm::dyn_cast_if_present(adaptor.getSource()), + getDestType(), paddingValue)) + return reshapedSource; + return {}; +} + +/// Folds a tensor.cast op into a consuming PackOp op if the +/// `tensor.cast` has source that is more static than the consuming op. +/// +/// Example: +/// ```mlir +/// %1 = tensor.cast %0 : tensor<8x16xf32> to tensor +/// %2 = tensor.pack %1 ... : tensor ... +/// ``` +/// +/// folds into: +/// +/// ```mlir +/// %2 = tensor.pack %0 ... : tensor<8x16xf32> ... 
+/// ``` +struct FoldTensorCastPackOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(PackOp op, + PatternRewriter &rewriter) const override { + if (!tensor::hasFoldableTensorCastOperand(op)) + return failure(); + + SmallVector newResultTypes(op->getResultTypes()); + SmallVector newOperands = + tensor::getUpdatedOperandsAfterCastOpFolding(op, newResultTypes); + + // Get the updated mixed-tile-sizes attribute. + SmallVector newMixedTileSizes = + getNewMixedTileSizes(rewriter, newResultTypes[0], op.getMixedTiles()); + + // Clone op. + // TODO: Strictly speaking, discardable attributes should be _discarded_ at + // this point. However, in practice, we use them for things that we'd like + // to preserve. Implement a better abstraction. + PackOp newOp = rewriter.create( + op.getLoc(), newOperands[0], newOperands[1], op.getInnerDimsPos(), + newMixedTileSizes, op.getPaddingValue(), op.getOuterDimsPerm()); + newOp->setDiscardableAttrs(op->getDiscardableAttrDictionary()); + + // Replace op. + Value oldResult = op.getResult(); + Value newResult = newOp.getResult(); + Value replacement = (newResult.getType() != oldResult.getType()) + ? rewriter.create( + op->getLoc(), oldResult.getType(), newResult) + : newResult; + + rewriter.replaceOp(op, {replacement}); + + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// UnPackOp +//===----------------------------------------------------------------------===// + +void UnPackOp::getAsmResultNames( + function_ref setNameFn) { + setNameFn(getResult(), "unpack"); +} + +LogicalResult +UnPackOp::reifyResultShapes(OpBuilder &builder, + ReifiedRankedShapedTypeDims &reifiedReturnShapes) { + return reifyResultShapesImpl(*this, builder, reifiedReturnShapes); +} + +DenseMap UnPackOp::getDimAndTileMapping() { + return getDimAndTileMappingImpl(*this); +} + +SmallVector UnPackOp::getMixedTiles() { + return getMixedTilesImpl(*this); +} + +SmallVector UnPackOp::getStaticTiles() { + return getStaticTilesImpl(*this); +} + +ArrayRef UnPackOp::getAllOuterDims() { + ShapedType destType = getDestType(); + int64_t destRank = destType.getRank(); + return getSourceType().getShape().take_front(destRank); +} + +SmallVector UnPackOp::getTiledOuterDims() { + auto innerDimsPos = getInnerDimsPos(); + auto packedShape = getSourceType().getShape(); + SmallVector res; + + for (auto index : innerDimsPos) + res.push_back(packedShape[index]); + + return res; +} + +LogicalResult UnPackOp::verify() { + return commonVerifierPackAndUnPackOp(*this); +} + +Speculation::Speculatability UnPackOp::getSpeculatability() { + // See PackOp::getSpeculatability. + if (!areTilesAndTiledDimsAllConstant(*this)) + return Speculation::NotSpeculatable; + + return Speculation::Speculatable; +} + +void UnPackOp::build(OpBuilder &builder, OperationState &state, Value source, + Value dest, ArrayRef innerDimsPos, + ArrayRef innerTiles, + ArrayRef outerDimsPerm) { + assert(innerDimsPos.size() == innerTiles.size() && + "number of tile sizes specified must match the specified number of " + "original dimensions to be tiled"); + SmallVector staticTileSizes; + SmallVector dynamicTileSizes; + dispatchIndexOpFoldResults(innerTiles, dynamicTileSizes, staticTileSizes); + build(builder, state, dest.getType(), source, dest, + outerDimsPerm.empty() ? 
nullptr + : builder.getDenseI64ArrayAttr(outerDimsPerm), + builder.getDenseI64ArrayAttr(innerDimsPos), dynamicTileSizes, + builder.getDenseI64ArrayAttr(staticTileSizes)); +} + +Value UnPackOp::createDestinationTensor(OpBuilder &b, Location loc, + Value source, + ArrayRef innerTileSizes, + ArrayRef innerDimsPos, + ArrayRef outerDimsPerm) { + AffineExpr sym0, sym1; + bindSymbols(b.getContext(), sym0, sym1); + auto dimMul = [&](OpFoldResult v1, OpFoldResult v2) -> OpFoldResult { + return affine::makeComposedFoldedAffineApply(b, loc, sym0 * sym1, {v1, v2}); + }; + + SmallVector mixedSizes; + auto srcType = llvm::cast(source.getType()); + for (auto i : + llvm::seq(0, srcType.getRank() - innerTileSizes.size())) { + if (srcType.isDynamicDim(i)) + mixedSizes.push_back(b.create(loc, source, i).getResult()); + else + mixedSizes.push_back(b.getIndexAttr(srcType.getDimSize(i))); + } + if (!outerDimsPerm.empty()) { + applyPermutationToVector( + mixedSizes, invertPermutationVector(outerDimsPerm)); + } + + for (auto [dimPos, tileSize] : llvm::zip_equal(innerDimsPos, innerTileSizes)) + mixedSizes[dimPos] = dimMul(mixedSizes[dimPos], tileSize); + + auto elemType = srcType.getElementType(); + return b.create(loc, mixedSizes, elemType); +} + +UnPackOp UnPackOp::createTransposedClone(OpBuilder &b, Location loc, + Value transposedSource, + ArrayRef innerPermutation, + ArrayRef outerPermutation) { + PackOrUnPackTransposeResult metadata = commonPermutationOfPackAndUnPackOp( + *this, innerPermutation, outerPermutation); + return b.create(loc, transposedSource, getDest(), + metadata.innerDimsPos, metadata.innerTiles, + metadata.outerDimsPerm); +} + +/// Returns true if the `srcShape` or `destShape` is different from the one in +/// `op` and populates each with the inferred static shape. 
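+/// For example (illustrative): with inner_dims_pos = [1] and inner_tiles =
+/// [32], unpacking tensor<100x8x32xf32> into tensor<?x256xf32> infers the
+/// destination shape as tensor<100x256xf32>.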
+static bool inferStaticShape(UnPackOp op, SmallVectorImpl &srcShape, + SmallVectorImpl &destShape) { + bool changeNeeded = false; + srcShape.assign(op.getSourceType().getShape().begin(), + op.getSourceType().getShape().end()); + destShape.assign(op.getDestType().getShape().begin(), + op.getDestType().getShape().end()); + llvm::SmallSetVector innerDims; + innerDims.insert(op.getInnerDimsPos().begin(), op.getInnerDimsPos().end()); + SmallVector inverseOuterDimsPerm; + if (!op.getOuterDimsPerm().empty()) + inverseOuterDimsPerm = invertPermutationVector(op.getOuterDimsPerm()); + int destRank = op.getDestRank(); + for (auto i : llvm::seq(0, destRank)) { + if (innerDims.contains(i)) + continue; + int64_t srcPos = i; + int64_t destPos = i; + if (!inverseOuterDimsPerm.empty()) + srcPos = inverseOuterDimsPerm[destPos]; + if (ShapedType::isDynamic(srcShape[srcPos]) == + ShapedType::isDynamic(destShape[destPos])) { + continue; + } + int64_t size = srcShape[srcPos]; + if (ShapedType::isDynamic(size)) + size = destShape[destPos]; + srcShape[srcPos] = size; + destShape[destPos] = size; + changeNeeded = true; + } + return changeNeeded; +} + +LogicalResult UnPackOp::canonicalize(UnPackOp unPackOp, + PatternRewriter &rewriter) { + /// unpack(pack(x)) -> x + if (PackOp packOp = unPackOp.getSource().getDefiningOp()) { + if (packOp.getSourceType() != unPackOp.getDestType()) + return failure(); + if (packOp.getPaddingValue() || + !hasSameInnerOuterAttribute(packOp, unPackOp) || + !haveSameTiles(packOp, unPackOp)) + return failure(); + rewriter.replaceOp(unPackOp, packOp.getSource()); + return success(); + } + /// unpack(destinationStyleOp(x)) -> unpack(x) + if (auto dstStyleOp = + unPackOp.getDest().getDefiningOp()) { + auto destValue = cast(unPackOp.getDest()); + Value newDest = dstStyleOp.getDpsInits()[destValue.getResultNumber()]; + rewriter.modifyOpInPlace(unPackOp, + [&]() { unPackOp.setDpsInitOperand(0, newDest); }); + return success(); + } + + // Insert tensor.cast ops if static shape inference is available.. + SmallVector srcShape, destShape; + if (inferStaticShape(unPackOp, srcShape, destShape)) { + Location loc = unPackOp.getLoc(); + Value source = unPackOp.getSource(); + if (srcShape != unPackOp.getSourceType().getShape()) { + auto newSrcType = unPackOp.getSourceType().clone(srcShape); + source = rewriter.create(loc, newSrcType, + unPackOp.getSource()); + } + Value dest = unPackOp.getDest(); + if (destShape != unPackOp.getDestType().getShape()) { + auto newDestType = unPackOp.getDestType().clone(destShape); + dest = + rewriter.create(loc, newDestType, unPackOp.getDest()); + } + Value newOp = rewriter.create( + loc, source, dest, unPackOp.getInnerDimsPos(), unPackOp.getMixedTiles(), + unPackOp.getOuterDimsPerm()); + rewriter.replaceOpWithNewOp( + unPackOp, unPackOp.getResult().getType(), newOp); + return success(); + } + + return failure(); +} + +bool UnPackOp::isLikeUnPad() { + RankedTensorType packedTensorType = getSourceType(); + return isLikePadUnPad(*this, packedTensorType); +} + +OpFoldResult UnPackOp::fold(FoldAdaptor adaptor) { + if (OpFoldResult reshapedSource = reshapeConstantSource( + llvm::dyn_cast_if_present(adaptor.getSource()), + getResult().getType())) + return reshapedSource; + return {}; +} + +/// Folds a tensor.cast op into a consuming UnPackOp op if the +/// `tensor.cast` has source that is more static than the consuming op. +/// +/// Example: +/// ```mlir +/// %1 = tensor.cast %0 : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32> +/// %2 = tensor.unpack %1 ... 
: tensor<1x1x?x1xi32> -> tensor<7x?xi32> +/// ``` +/// +/// folds into: +/// +/// ```mlir +/// %2 = tensor.unpack %0 ... tensor<1x1x8x1xi32> -> tensor<7x?xi32> +/// ``` +struct FoldTensorCastUnPackOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(UnPackOp op, + PatternRewriter &rewriter) const override { + if (!tensor::hasFoldableTensorCastOperand(op)) + return failure(); + + SmallVector newResultTypes(op->getResultTypes()); + SmallVector newOperands = + tensor::getUpdatedOperandsAfterCastOpFolding(op, newResultTypes); + Value sourceTensor = newOperands[0]; + + // Get the updated mixed-tile-sizes attribute. + SmallVector newMixedTileSizes = getNewMixedTileSizes( + rewriter, sourceTensor.getType(), op.getMixedTiles()); + + // Clone op. + // TODO: Strictly speaking, discardable attributes should be _discarded_ at + // this point. However, in practice, we use them for things that we'd like + // to preserve. Implement a better abstraction. + UnPackOp newOp = rewriter.create( + op.getLoc(), sourceTensor, newOperands[1], op.getInnerDimsPos(), + newMixedTileSizes, op.getOuterDimsPerm()); + newOp->setDiscardableAttrs(op->getDiscardableAttrDictionary()); + + // Replace op. + Value oldResult = op.getResult(); + Value newResult = newOp.getResult(); + Value replacement = (newResult.getType() != oldResult.getType()) + ? rewriter.create( + op->getLoc(), oldResult.getType(), newResult) + : newResult; + + rewriter.replaceOp(op, {replacement}); + + return success(); + } +}; + } // namespace linalg } // namespace mlir + +//===----------------------------------------------------------------------===// +// LinalgDialect +//===----------------------------------------------------------------------===// + +void LinalgDialect::getCanonicalizationPatterns( + RewritePatternSet &results) const { + results.add(getContext()); +} + +Operation *LinalgDialect::materializeConstant(OpBuilder &builder, + Attribute value, Type type, + Location loc) { + return arith::ConstantOp::materialize(builder, value, type, loc); +} diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 51d1df52598c7..2f54e780093a2 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -267,6 +267,16 @@ void transform::ApplyPadVectorizationPatternsOp::populatePatterns( linalg::populatePadOpVectorizationPatterns(patterns); } +void transform::ApplyFoldIntoPackAndUnpackPatternsOp::populatePatterns( + RewritePatternSet &patterns) { + linalg::populateFoldIntoPackAndUnpackPatterns(patterns); +} + +void transform::ApplyFoldPackUnpackIntoEmptyPatternsOp::populatePatterns( + RewritePatternSet &patterns) { + linalg::populateFoldPackUnpackIntoTensorEmptyPatterns(patterns); +} + //===----------------------------------------------------------------------===// // BufferizeToAllocationOp //===----------------------------------------------------------------------===// @@ -1170,7 +1180,7 @@ LogicalResult transform::InterchangeOp::verify() { //===----------------------------------------------------------------------===// DiagnosedSilenceableFailure transform::LowerPackOp::applyToOne( - transform::TransformRewriter &rewriter, tensor::PackOp target, + transform::TransformRewriter &rewriter, linalg::PackOp target, transform::ApplyToEachResultList &transformResults, transform::TransformState &state) { rewriter.setInsertionPoint(target); @@ -1192,7 
+1202,7 @@ DiagnosedSilenceableFailure transform::LowerPackOp::applyToOne( //===----------------------------------------------------------------------===// DiagnosedSilenceableFailure transform::LowerUnPackOp::applyToOne( - transform::TransformRewriter &rewriter, tensor::UnPackOp target, + transform::TransformRewriter &rewriter, linalg::UnPackOp target, transform::ApplyToEachResultList &transformResults, transform::TransformState &state) { rewriter.setInsertionPoint(target); @@ -1622,7 +1632,7 @@ bool isValidPackingPermutation( RelayoutOpTy op, ArrayRef permutation, OuterOrInnerPerm outerOrInnerPerm = OuterOrInnerPerm::Outer) { static_assert( - llvm::is_one_of::value, + llvm::is_one_of::value, "applies to only pack or unpack operations"); if (!op || permutation.empty()) return true; @@ -1631,7 +1641,7 @@ bool isValidPackingPermutation( return permutation.size() == innerRank && isPermutationVector(permutation); // op.getOuterDimsPerm() may be empty, in which case it is identity. // Don't rely on it. - if (std::is_same::value) { + if (std::is_same::value) { return permutation.size() == op.getSourceRank() && isPermutationVector(permutation); } @@ -1665,11 +1675,11 @@ transform::PackTransposeOp::apply(transform::TransformRewriter &rewriter, } // Step 2.2. Fail on wrong type. - auto packOp = dyn_cast(*packOrUnpackOps.begin()); - auto unPackOp = dyn_cast(*packOrUnpackOps.begin()); + auto packOp = dyn_cast(*packOrUnpackOps.begin()); + auto unPackOp = dyn_cast(*packOrUnpackOps.begin()); if ((!packOp && !unPackOp)) { return emitSilenceableError() << "requires target to map to a " - "tensor.pack or tensor.unpack"; + "linalg.pack or linalg.unpack"; } LinalgOp linalgOpTarget = dyn_cast(*linalgOps.begin()); if (!linalgOpTarget) @@ -1694,7 +1704,7 @@ transform::PackTransposeOp::apply(transform::TransformRewriter &rewriter, assert(!packOp && "packOp must be null on entry when unPackOp is not null"); OpOperand *packUse = linalgOp.getDpsInitOperand( cast(unPackOp.getSource()).getResultNumber()); - packOp = dyn_cast_or_null(packUse->get().getDefiningOp()); + packOp = dyn_cast_or_null(packUse->get().getDefiningOp()); if (!packOp || !packOp.getResult().hasOneUse()) return emitSilenceableError() << "could not find matching pack op"; } diff --git a/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp b/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp index 7f9a0f7a6ca43..81842e4bea631 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp @@ -88,7 +88,7 @@ static bool validateFullTilesOnDims(linalg::LinalgOp linalgOp, /// Return failure or packed matmul with one of its operands transposed. 
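+/// As a hedged illustration (tile sizes and shapes invented), transposing the
+/// outer and inner blocks of a packed operand rewrites
+///
+/// ```mlir
+/// %p = linalg.pack %b inner_dims_pos = [0, 1] inner_tiles = [16, 32]
+///     into %d : tensor<128x256xf32> -> tensor<8x8x16x32xf32>
+/// ```
+///
+/// into the transposed layout
+///
+/// ```mlir
+/// %p = linalg.pack %b outer_dims_perm = [1, 0] inner_dims_pos = [1, 0]
+///     inner_tiles = [32, 16]
+///     into %dt : tensor<128x256xf32> -> tensor<8x8x32x16xf32>
+/// ```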
static FailureOr transposePackedMatmul(RewriterBase &rewriter, linalg::LinalgOp linalgOp, - tensor::PackOp packOp, AffineMap operandMap, + linalg::PackOp packOp, AffineMap operandMap, ArrayRef blocksStartDimPos, bool transposeOuterBlocks, bool transposeInnerBlocks) { assert(operandMap.getNumDims() >= 4 && diff --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt index 3594b08413812..d18b6f8afc43b 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt @@ -26,6 +26,7 @@ add_mlir_dialect_library(MLIRLinalgTransforms MeshShardingInterfaceImpl.cpp NamedOpConversions.cpp BlockPackMatmul.cpp + PackAndUnpackPatterns.cpp Padding.cpp Promotion.cpp RuntimeOpVerification.cpp diff --git a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp index c906f3bdcc632..9f5000b70b6f6 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp @@ -61,7 +61,7 @@ template static FailureOr getPackingInfoFromOperand(OpOperand *opOperand, linalg::GenericOp genericOp, OpTy packOrUnPackOp) { - static_assert(llvm::is_one_of::value, + static_assert(llvm::is_one_of::value, "applies to only pack or unpack operations"); LLVM_DEBUG( { llvm::dbgs() << "--- Construct PackInfo From an operand ---\n"; }); @@ -210,7 +210,7 @@ static SmallVector computeOuterDims(ArrayRef perm, /// %4 = arith.addf %arg3, %arg4 : f32 /// linalg.yield %4 : f32 /// } -> tensor -/// %1 = tensor.pack %0 +/// %1 = linalg.pack %0 /// inner_dims_pos = [0, 1] /// inner_tiles = [8, 2] /// into %dest : tensor -> tensor @@ -219,7 +219,7 @@ static SmallVector computeOuterDims(ArrayRef perm, /// 8. Thus, the below operation and `affine_map<(d0, d1, d2, d3)> -> /// affine_map<(d1, d3)>` will be returned. /// -/// %pack = tensor.pack %arg0 +/// %pack = linalg.pack %arg0 /// inner_dims_pos = [0] /// inner_tiles = [8] /// into %init : tensor -> tensor @@ -290,9 +290,9 @@ getOrCreatePackedViewOfOperand(OpBuilder &b, Location loc, PackInfo packInfo, if (innerDimsPos.empty() && outerDimsPerm.empty()) return std::make_tuple(opOperand->get(), indexingMap); - auto empty = tensor::PackOp::createDestinationTensor( + auto empty = linalg::PackOp::createDestinationTensor( b, loc, opOperand->get(), innerTileSizes, innerDimsPos, outerDimsPerm); - auto packedOperand = b.create( + auto packedOperand = b.create( loc, opOperand->get(), empty, innerDimsPos, innerTileSizes, /*padding=*/std::nullopt, outerDimsPerm); return std::make_tuple(packedOperand, indexingMap); @@ -327,7 +327,7 @@ static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp, return newGenericOp; } -/// Bubbles up tensor.pack op through a producer generic op. This +/// Bubbles up linalg.pack op through a producer generic op. This /// swap pack(generic) to generic(pack). The new generic op works on packed /// domain; pack ops are created for input and output operands. 
E.g., /// @@ -343,7 +343,7 @@ static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp, /// %4 = arith.addf %arg3, %arg3 : f32 /// linalg.yield %4 : f32 /// } -> tensor -/// %4 = tensor.pack %3 +/// %4 = linalg.pack %3 /// inner_dims_pos = [0, 1] /// inner_tiles = [8, 2] /// into %dest : tensor -> tensor @@ -358,7 +358,7 @@ static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp, /// %0 = affine.apply #map()[%dim] /// %1 = affine.apply #map1()[%dim_0] /// %2 = tensor.empty(%0, %1) : tensor -/// %pack = tensor.pack %arg0 +/// %pack = linalg.pack %arg0 /// inner_dims_pos = [0, 1] /// inner_tiles = [8, 2] /// into %2 : tensor -> tensor @@ -371,7 +371,7 @@ static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp, /// linalg.yield %4 : f32 /// } -> tensor static FailureOr -bubbleUpPackOpThroughGenericOp(RewriterBase &rewriter, tensor::PackOp packOp, +bubbleUpPackOpThroughGenericOp(RewriterBase &rewriter, linalg::PackOp packOp, const ControlPropagationFn &controlFn) { auto genericOp = packOp.getSource().getDefiningOp(); if (!genericOp) @@ -416,11 +416,11 @@ bubbleUpPackOpThroughGenericOp(RewriterBase &rewriter, tensor::PackOp packOp, rewriter.setInsertionPoint(genericOp); // We need to handle two cases: - // 1) The tensor.pack destination is a tensor.empty. If this is the case, we + // 1) The linalg.pack destination is a tensor.empty. If this is the case, we // create a new tensor.empty to avoid breaking dominance, as we are moving the - // tensor.pack above the linalg.generic. + // linalg.pack above the linalg.generic. // 2) The destination is not a tensor.empty. In this case we can replace only - // if the destination of the tensor.pack dominates the linalg.generic. + // if the destination of the linalg.pack dominates the linalg.generic. Value packOpDest = packOp.getDest(); if (!packOpDest.hasOneUse()) return failure(); @@ -453,13 +453,13 @@ bubbleUpPackOpThroughGenericOp(RewriterBase &rewriter, tensor::PackOp packOp, /// Wrapper pattern that applies bubbleUpPackOpThroughGenericOp method. struct BubbleUpPackOpThroughGenericOpPattern - : public OpRewritePattern { + : public OpRewritePattern { public: BubbleUpPackOpThroughGenericOpPattern(MLIRContext *context, ControlPropagationFn fun) - : OpRewritePattern(context), controlFn(std::move(fun)) {} + : OpRewritePattern(context), controlFn(std::move(fun)) {} - LogicalResult matchAndRewrite(tensor::PackOp packOp, + LogicalResult matchAndRewrite(linalg::PackOp packOp, PatternRewriter &rewriter) const override { auto genericOp = bubbleUpPackOpThroughGenericOp(rewriter, packOp, controlFn); @@ -473,15 +473,15 @@ struct BubbleUpPackOpThroughGenericOpPattern ControlPropagationFn controlFn; }; -/// Propagate a tensor.pack operation up through a tensor.pad. The idea is to +/// Propagate a linalg.pack operation up through a tensor.pad. The idea is to /// add as many zero padding dimensions in `high` and `low` based on the number /// of point loops. 
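+/// A hypothetical example (shapes and names invented); padding on an untiled
+/// dimension is hoisted above the pack:
+///
+/// ```mlir
+/// %padded = tensor.pad %src low[0, 0] high[3, 0] {
+/// ^bb0(%i: index, %j: index):
+///   tensor.yield %cst : f32
+/// } : tensor<13x64xf32> to tensor<16x64xf32>
+/// %packed = linalg.pack %padded inner_dims_pos = [1] inner_tiles = [32]
+///     into %dest : tensor<16x64xf32> -> tensor<16x2x32xf32>
+/// ```
+///
+/// becomes
+///
+/// ```mlir
+/// %packed = linalg.pack %src inner_dims_pos = [1] inner_tiles = [32]
+///     into %empty : tensor<13x64xf32> -> tensor<13x2x32xf32>
+/// %padded = tensor.pad %packed low[0, 0, 0] high[3, 0, 0] {
+/// ^bb0(%i: index, %j: index, %k: index):
+///   tensor.yield %cst : f32
+/// } : tensor<13x2x32xf32> to tensor<16x2x32xf32>
+/// ```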
-class BubbleUpPackThroughPadOp final : public OpRewritePattern { +class BubbleUpPackThroughPadOp final : public OpRewritePattern { public: BubbleUpPackThroughPadOp(MLIRContext *context, ControlPropagationFn fun) - : OpRewritePattern(context), controlFn(std::move(fun)) {} + : OpRewritePattern(context), controlFn(std::move(fun)) {} - LogicalResult matchAndRewrite(tensor::PackOp packOp, + LogicalResult matchAndRewrite(linalg::PackOp packOp, PatternRewriter &rewriter) const override { auto padOp = packOp.getSource().getDefiningOp(); if (!padOp) @@ -522,10 +522,10 @@ class BubbleUpPackThroughPadOp final : public OpRewritePattern { ArrayRef outerDimsPerm = packOp.getOuterDimsPerm(); SmallVector mixedTiles = packOp.getMixedTiles(); - auto empty = tensor::PackOp::createDestinationTensor( + auto empty = linalg::PackOp::createDestinationTensor( rewriter, loc, padOp.getSource(), mixedTiles, innerDimsPos, outerDimsPerm); - auto sourcePack = rewriter.create( + auto sourcePack = rewriter.create( loc, padOp.getSource(), empty, innerDimsPos, mixedTiles, /*padding=*/std::nullopt, outerDimsPerm); @@ -549,9 +549,9 @@ class BubbleUpPackThroughPadOp final : public OpRewritePattern { // If the pad has more than one user, create an unpack on the new pad to // replace the other uses. if (!padOp->hasOneUse()) { - auto unpackEmpty = tensor::UnPackOp::createDestinationTensor( + auto unpackEmpty = linalg::UnPackOp::createDestinationTensor( rewriter, loc, newPadOp, mixedTiles, innerDimsPos, outerDimsPerm); - Value unpackedPad = rewriter.create( + Value unpackedPad = rewriter.create( loc, newPadOp, unpackEmpty, innerDimsPos, mixedTiles, outerDimsPerm); rewriter.replaceAllUsesExcept(padOp, unpackedPad, sourcePack); } @@ -636,20 +636,20 @@ static int64_t applyPermutationAndReindexReassoc( /// /// %collapsed = tensor.collapse_shape %in [[0, 1], 2] /// : tensor into tensor -/// %pack = tensor.pack %collapsed outer_dims_perm = [0, 1] +/// %pack = linalg.pack %collapsed outer_dims_perm = [0, 1] /// inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %empty /// : tensor -> tensor /// /// can be transformed into: /// -/// %pack = tensor.pack %in outer_dims_perm = [1, 2] +/// %pack = linalg.pack %in outer_dims_perm = [1, 2] /// inner_dims_pos = [1, 2] inner_tiles = [8, 1] into %empty /// : tensor -> tensor /// %collapsed = tensor.collapse_shape %pack [[0, 1], 2, 3, 4] /// : tensor into tensor static LogicalResult bubbleUpPackOpThroughCollapseShape(tensor::CollapseShapeOp collapseOp, - tensor::PackOp packOp, + linalg::PackOp packOp, PatternRewriter &rewriter) { SmallVector innerTileSizes = packOp.getStaticTiles(); ArrayRef innerDimsPos = packOp.getInnerDimsPos(); @@ -682,10 +682,10 @@ bubbleUpPackOpThroughCollapseShape(tensor::CollapseShapeOp collapseOp, reassocIndices[outerPos].end()); } - auto emptyOp = tensor::PackOp::createDestinationTensor( + auto emptyOp = linalg::PackOp::createDestinationTensor( rewriter, packOp.getLoc(), collapseOp.getSrc(), packOp.getMixedTiles(), projectedInnerDimsPos, newOuterDimsPerm); - auto newPackOp = rewriter.create( + auto newPackOp = rewriter.create( packOp.getLoc(), collapseOp.getSrc(), emptyOp, projectedInnerDimsPos, packOp.getMixedTiles(), packOp.getPaddingValue(), newOuterDimsPerm); @@ -742,20 +742,20 @@ projectDimsPosIntoReassocPos(ArrayRef dimsPos, /// /// %expand = tensor.expand_shape %in [[0], [1, 2]] /// : tensor into tensor -/// %pack = tensor.pack %expand outer_dims_perm = [0, 1] +/// %pack = linalg.pack %expand outer_dims_perm = [0, 1] /// inner_dims_pos = [2] inner_tiles = [8] into 
%empty /// : tensor -> tensor /// /// can be transformed into: /// -/// %pack = tensor.pack %in outer_dims_perm = [1, 2] +/// %pack = linalg.pack %in outer_dims_perm = [1, 2] /// inner_dims_pos = [1] inner_tiles = [8] into %empty /// : tensor -> tensor /// %expand = tensor.expand_shape %pack [[0], [1, 2], [3]] /// : tensor into tensor static LogicalResult bubbleUpPackOpThroughExpandShape(tensor::ExpandShapeOp expandOp, - tensor::PackOp packOp, + linalg::PackOp packOp, PatternRewriter &rewriter) { // Outer dimensions permutation is not supported currently. // TODO: Handle outer_dims_perm variants. @@ -808,7 +808,7 @@ bubbleUpPackOpThroughExpandShape(tensor::ExpandShapeOp expandOp, // If reassociation is not possible, then reordering cannot happen. // This can be caused by pack padding affecting previously expanded // dimensions or packing extending dimensions. - RankedTensorType newPackType = tensor::PackOp::inferPackedType( + RankedTensorType newPackType = linalg::PackOp::inferPackedType( expandOp.getSrcType(), packOp.getStaticInnerTiles(), projectedInnerDimsPos, /*outerDimsPerm=*/SmallVector{}); auto reassocExpand = @@ -817,10 +817,10 @@ bubbleUpPackOpThroughExpandShape(tensor::ExpandShapeOp expandOp, return rewriter.notifyMatchFailure( packOp, "could not reassociate dims after bubbling up"); - Value destTensor = tensor::PackOp::createDestinationTensor( + Value destTensor = linalg::PackOp::createDestinationTensor( rewriter, packOp.getLoc(), expandOp.getSrc(), packOp.getMixedTiles(), projectedInnerDimsPos, /*outerDimsPerm=*/SmallVector{}); - Value packedVal = rewriter.create( + Value packedVal = rewriter.create( packOp.getLoc(), expandOp.getSrc(), destTensor, projectedInnerDimsPos, packOp.getMixedTiles(), packOp.getPaddingValue(), /*outerDimsPerm=*/SmallVector{}); @@ -833,12 +833,12 @@ bubbleUpPackOpThroughExpandShape(tensor::ExpandShapeOp expandOp, } class BubbleUpPackOpThroughReshapeOp final - : public OpRewritePattern { + : public OpRewritePattern { public: BubbleUpPackOpThroughReshapeOp(MLIRContext *context, ControlPropagationFn fun) - : OpRewritePattern(context), controlFn(std::move(fun)) {} + : OpRewritePattern(context), controlFn(std::move(fun)) {} - LogicalResult matchAndRewrite(tensor::PackOp packOp, + LogicalResult matchAndRewrite(linalg::PackOp packOp, PatternRewriter &rewriter) const override { Operation *srcOp = packOp.getSource().getDefiningOp(); // Currently only support when the pack op is the only user. @@ -877,7 +877,7 @@ class BubbleUpPackOpThroughReshapeOp final /// /// For example: /// -/// %unpack = tensor.unpack %in outer_dims_perm = [0, 1] +/// %unpack = linalg.unpack %in outer_dims_perm = [0, 1] /// inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %empty /// : tensor -> tensor /// %expanded = tensor.expand_shape %unpack [[0, 1], [2]] @@ -887,11 +887,11 @@ class BubbleUpPackOpThroughReshapeOp final /// /// %expanded = tensor.expand_shape %ain [[0, 1], [2], [3], [4]] /// : tensor into tensor -/// %unpack = tensor.unpack %expanded outer_dims_perm = [0, 1, 2] +/// %unpack = linalg.unpack %expanded outer_dims_perm = [0, 1, 2] /// inner_dims_pos = [1, 2] inner_tiles = [8, 8] into %empty /// : tensor -> tensor static LogicalResult pushDownUnPackOpThroughExpandShape( - tensor::UnPackOp unPackOp, tensor::ExpandShapeOp expandOp, + linalg::UnPackOp unPackOp, tensor::ExpandShapeOp expandOp, PatternRewriter &rewriter, ControlPropagationFn controlFn) { // User controlled propagation function. 
if (!controlFn(&expandOp.getSrcMutable())) @@ -943,16 +943,16 @@ static LogicalResult pushDownUnPackOpThroughExpandShape( nextPos += 1; } - RankedTensorType newExpandType = tensor::PackOp::inferPackedType( + RankedTensorType newExpandType = linalg::PackOp::inferPackedType( expandTy, innerTileSizes, projectedInnerDimsPos, newOuterDimsPerm); auto newExpandOp = rewriter.create( expandOp.getLoc(), newExpandType, unPackOp.getSource(), newReassocIndices); - auto emptyOp = tensor::UnPackOp::createDestinationTensor( + auto emptyOp = linalg::UnPackOp::createDestinationTensor( rewriter, unPackOp.getLoc(), newExpandOp, unPackOp.getMixedTiles(), projectedInnerDimsPos, newOuterDimsPerm); - auto newUnPackOp = rewriter.create( + auto newUnPackOp = rewriter.create( unPackOp.getLoc(), newExpandOp.getResult(), emptyOp, projectedInnerDimsPos, unPackOp.getMixedTiles(), newOuterDimsPerm); rewriter.replaceOp(expandOp, newUnPackOp); @@ -961,14 +961,14 @@ static LogicalResult pushDownUnPackOpThroughExpandShape( } class PushDownUnPackOpThroughReshapeOp final - : public OpRewritePattern { + : public OpRewritePattern { public: PushDownUnPackOpThroughReshapeOp(MLIRContext *context, ControlPropagationFn fun) - : OpRewritePattern(context), controlFn(std::move(fun)) { + : OpRewritePattern(context), controlFn(std::move(fun)) { } - LogicalResult matchAndRewrite(tensor::UnPackOp unPackOp, + LogicalResult matchAndRewrite(linalg::UnPackOp unPackOp, PatternRewriter &rewriter) const override { Value result = unPackOp.getResult(); // Currently only support unpack op with the single user. @@ -1001,7 +1001,7 @@ class PushDownUnPackOpThroughReshapeOp final static FailureOr getUnPackedOperand(GenericOp genericOp) { OpOperand *unPackedOperand = nullptr; for (OpOperand &operand : genericOp->getOpOperands()) { - auto unPackOp = operand.get().getDefiningOp(); + auto unPackOp = operand.get().getDefiningOp(); if (!unPackOp) continue; if (unPackedOperand) @@ -1013,9 +1013,9 @@ static FailureOr getUnPackedOperand(GenericOp genericOp) { return unPackedOperand; } -/// Push down a tensor.unpack op through a generic op. +/// Push down a linalg.unpack op through a generic op. /// The new generic op works on packed domain; pack ops are created for input -/// and output operands. A tensor.unpack op is inserted right after the packed +/// and output operands. A linalg.unpack op is inserted right after the packed /// generic. E.g. /// /// #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> @@ -1023,7 +1023,7 @@ static FailureOr getUnPackedOperand(GenericOp genericOp) { /// %arg0 = tensor<12x2x56x56x32xf32> // packed arg. /// /// %0 = tensor.empty() : tensor<12x56x56x64xf32> -/// %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] +/// %1 = linalg.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] /// inner_dims_pos = [3] inner_tiles = [32] into %0 /// %2 = linalg.generic {indexing_maps = [#map], /// iterator_types = ["parallel", "parallel", "parallel", "parallel"]} @@ -1044,7 +1044,7 @@ static FailureOr getUnPackedOperand(GenericOp genericOp) { /// ^bb0(%out : f32): /// linalg.yield %out : f32 /// } -> tensor<12x2x56x56x32xf32> -/// %2 = tensor.unpack %1 outer_dims_perm = [0, 3, 1, 2] +/// %2 = linalg.unpack %1 outer_dims_perm = [0, 3, 1, 2] /// inner_dims_pos = [3] inner_tiles = [32] into %0 /// static FailureOr> @@ -1063,8 +1063,8 @@ pushDownUnPackOpThroughGenericOp(RewriterBase &rewriter, GenericOp genericOp, OpOperand *unPackedOperand = *(maybeUnPackedOperand); // Extract packing information. 
- tensor::UnPackOp producerUnPackOp = - unPackedOperand->get().getDefiningOp(); + linalg::UnPackOp producerUnPackOp = + unPackedOperand->get().getDefiningOp(); assert(producerUnPackOp && "expect a valid UnPackOp"); if (!controlFn(unPackedOperand)) @@ -1079,7 +1079,7 @@ pushDownUnPackOpThroughGenericOp(RewriterBase &rewriter, GenericOp genericOp, auto [packedOutOperand, packedOutIndexingMap] = getOrCreatePackedViewOfOperand(rewriter, genericOp.getLoc(), *packInfo, genericOp, genericOp.getDpsInitOperand(0)); - auto destPack = packedOutOperand.getDefiningOp(); + auto destPack = packedOutOperand.getDefiningOp(); // If the dps init operand of the generic is a tensor.empty, do not pack it // and forward the new tensor.empty as a destination. @@ -1108,7 +1108,7 @@ pushDownUnPackOpThroughGenericOp(RewriterBase &rewriter, GenericOp genericOp, // Insert an unPackOp right after the packed generic. Value unPackOpRes = rewriter - .create(genericOp.getLoc(), newResult, + .create(genericOp.getLoc(), newResult, destPack.getSource(), innerDimsPos, mixedTiles, outerDimsPerm) .getResult(); @@ -1137,7 +1137,7 @@ struct PushDownUnPackOpThroughGenericOp : public OpRewritePattern { ControlPropagationFn controlFn; }; -/// Propagate a tensor.unpack operation through a tensor.pad. The idea is to +/// Propagate a linalg.unpack operation through a tensor.pad. The idea is to /// add as many zero padding dimensions in `high` and `low` based on the number /// of point loops. struct PushDownUnPackThroughPadOp : public OpRewritePattern { @@ -1146,8 +1146,8 @@ struct PushDownUnPackThroughPadOp : public OpRewritePattern { LogicalResult matchAndRewrite(tensor::PadOp padOp, PatternRewriter &rewriter) const override { - tensor::UnPackOp unpackOp = - padOp.getSource().getDefiningOp(); + linalg::UnPackOp unpackOp = + padOp.getSource().getDefiningOp(); if (!unpackOp) return failure(); @@ -1185,12 +1185,12 @@ struct PushDownUnPackThroughPadOp : public OpRewritePattern { loc, /*result=*/Type(), unpackOp.getSource(), lowPad, highPad, paddingVal, padOp.getNofold()); - // Inject the tensor.unpack right after the packed padOp. + // Inject the linalg.unpack right after the packed padOp. 
Value outputUnPack = rewriter.create( loc, padOp.getResultType().getShape(), padOp.getResultType().getElementType()); - Value replacement = rewriter.create( + Value replacement = rewriter.create( loc, newPadOp.getResult(), outputUnPack, innerDimsPos, unpackOp.getMixedTiles(), outerDimsPerm); rewriter.replaceOp(padOp, replacement); diff --git a/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp b/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp index 83c4b5bdf1097..ae8cb94661c76 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp @@ -223,21 +223,21 @@ LogicalResult DecomposeProjectedPermutation::matchAndRewrite( newMap[i] = rewriter.getMultiDimIdentityMap(map.getNumDims()); } - if (isChanged) { - SmallVector operands = op->getOperands(); - ValueRange operandsRef(operands); - - auto newOp = rewriter.create( - /*location=*/op.getLoc(), - /*resultTensorTypes=*/op->getResultTypes(), - /*inputs=*/newInitValues, - /*outputs=*/operandsRef.drop_front(op.getNumDpsInputs()), - /*indexingMaps=*/newMap, - /*iteratorTypes=*/op.getIteratorTypesArray()); - - newOp.getRegion().takeBody(op->getRegion(0)); - rewriter.replaceOp(op, newOp->getResults()); - } + if (!isChanged) + return failure(); + + SmallVector operands = op->getOperands(); + ValueRange operandsRef(operands); + + auto newOp = rewriter.create( + /*location=*/op.getLoc(), + /*resultTensorTypes=*/op->getResultTypes(), + /*inputs=*/newInitValues, + /*outputs=*/operandsRef.drop_front(op.getNumDpsInputs()), + /*indexingMaps=*/newMap, + /*iteratorTypes=*/op.getIteratorTypesArray()); + newOp.getRegion().takeBody(op->getRegion(0)); + rewriter.replaceOp(op, newOp->getResults()); return success(); } diff --git a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp b/mlir/lib/Dialect/Linalg/Transforms/PackAndUnpackPatterns.cpp similarity index 90% rename from mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp rename to mlir/lib/Dialect/Linalg/Transforms/PackAndUnpackPatterns.cpp index 3566714c6529e..0984b6988b93b 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/PackAndUnpackPatterns.cpp @@ -13,7 +13,7 @@ #include "mlir/IR/PatternMatch.h" namespace mlir { -namespace tensor { +namespace linalg { namespace { /// Returns the number of shape sizes that is either dynamic or greater than 1. @@ -201,7 +201,7 @@ struct FoldPadWithPackOp : public OpRewritePattern { LogicalResult matchAndRewrite(PackOp packOp, PatternRewriter &rewriter) const override { - auto padOp = packOp.getSource().getDefiningOp(); + auto padOp = packOp.getSource().getDefiningOp(); if (!padOp || padOp.getNofold() || !padOp.hasZeroLowPad()) return failure(); @@ -224,10 +224,11 @@ struct FoldPadWithPackOp : public OpRewritePattern { /// Fold a `unpack` -> `extract_slice` into the `unpack` since it already /// has extract_slice semantics. 
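+///
+/// A hypothetical example (shapes invented): a zero-offset, unit-stride slice
+/// of the unpacked result
+///
+/// ```mlir
+/// %u = linalg.unpack %src inner_dims_pos = [0] inner_tiles = [8]
+///     into %dest : tensor<4x8xf32> -> tensor<32xf32>
+/// %s = tensor.extract_slice %u[0] [29] [1] : tensor<32xf32> to tensor<29xf32>
+/// ```
+///
+/// folds into an unpack that writes directly into a destination of the sliced
+/// size:
+///
+/// ```mlir
+/// %e = tensor.empty() : tensor<29xf32>
+/// %u = linalg.unpack %src inner_dims_pos = [0] inner_tiles = [8]
+///     into %e : tensor<4x8xf32> -> tensor<29xf32>
+/// ```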
-struct FoldUnpackWithExtractSliceOp : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +struct FoldUnpackWithExtractSliceOp + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(ExtractSliceOp sliceOp, + LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp, PatternRewriter &rewriter) const override { auto unpackOp = sliceOp.getSource().getDefiningOp(); if (!unpackOp) @@ -247,7 +248,7 @@ struct FoldUnpackWithExtractSliceOp : public OpRewritePattern { // Create a new empty output tensor. Type elementType = unpackOp.getDestType().getElementType(); - Value output = rewriter.create( + Value output = rewriter.create( sliceOp.getLoc(), sliceOp.getMixedSizes(), elementType); rewriter.replaceOpWithNewOp( sliceOp, unpackOp.getSource(), output, unpackOp.getInnerDimsPos(), @@ -474,6 +475,50 @@ struct FoldConsumerUnPackWithProducerLinalgTransposeOp return success(); } }; + +/// tensor.empty does not define any tensor contents, so an unpadded pack +/// can be folded away. +struct FoldEmptyTensorWithPackOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(PackOp packOp, + PatternRewriter &rewriter) const override { + // Check for tensor.empty source. + auto emptyOp = packOp.getSource().getDefiningOp(); + if (!emptyOp) + return failure(); + + // Check for padding. + // Packing with padding cannot be simply removed. + if (packOp.getPaddingValue()) + return rewriter.notifyMatchFailure(packOp, "expects no padding value"); + + // Replace the pack directly with its destination. + rewriter.replaceOp(packOp, packOp.getDest()); + + return success(); + } +}; + +/// tensor.empty does not define any tensor contents, so an unpack +/// can be folded away. +struct FoldEmptyTensorWithUnPackOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(UnPackOp unPackOp, + PatternRewriter &rewriter) const override { + // Check for tensor.empty source. + auto emptyOp = unPackOp.getSource().getDefiningOp(); + if (!emptyOp) + return failure(); + + // Replace the unpack directly with its destination. 
+ rewriter.replaceOp(unPackOp, unPackOp.getDest()); + + return success(); + } +}; + } // namespace void populateFoldIntoPackAndUnpackPatterns(RewritePatternSet &patterns) { @@ -490,5 +535,11 @@ void populateSimplifyPackAndUnpackPatterns(RewritePatternSet &patterns) { patterns.getContext()); } -} // namespace tensor +void populateFoldPackUnpackIntoTensorEmptyPatterns( + RewritePatternSet &patterns) { + patterns.add( + patterns.getContext()); +} + +} // namespace linalg } // namespace mlir diff --git a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp index 269272c10903c..36daea5f8fd3f 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp @@ -10,14 +10,17 @@ #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Interfaces/TilingInterface.h" +#include "mlir/Interfaces/ValueBoundsOpInterface.h" #include using namespace mlir; @@ -571,6 +574,648 @@ struct LinalgOpPartialReductionInterface } }; +template +static SmallVector getPackUnPackIterationDomain(OpTy op, + OpBuilder &builder) { + static_assert(llvm::is_one_of::value, + "applies to only pack or unpack operations"); + OpBuilder::InsertionGuard g(builder); + int64_t rank = (std::is_same::value) ? op.getSourceRank() + : op.getDestRank(); + OpFoldResult zero = builder.getIndexAttr(0); + OpFoldResult one = builder.getIndexAttr(1); + ReifiedRankedShapedTypeDims resultShape; + (void)reifyResultShapes(builder, op, resultShape); + SmallVector loopBounds(rank); + for (auto dim : llvm::seq(0, rank)) { + loopBounds[dim].offset = zero; + loopBounds[dim].stride = one; + loopBounds[dim].size = resultShape[0][dim]; + } + return loopBounds; +} + +static void applyPermToRange(SmallVector &offsets, + SmallVector &sizes, + ArrayRef permutation) { + if (permutation.empty()) + return; + applyPermutationToVector(offsets, permutation); + applyPermutationToVector(sizes, permutation); +} + +struct PackOpTiling + : public TilingInterface::ExternalModel { + + SmallVector getLoopIteratorTypes(Operation *op) const { + // Note that here we only consider untiled dimensions and outer tiled data + // dimensions, the inner tiled data dimensions are materialized when + // building the body of the operation. + auto packOp = cast(op); + SmallVector iteratorTypes( + packOp.getSourceRank(), utils::IteratorType::parallel); + return iteratorTypes; + } + + SmallVector getIterationDomain(Operation *op, OpBuilder &b) const { + return getPackUnPackIterationDomain(cast(op), b); + } + + FailureOr + getTiledImplementation(Operation *op, OpBuilder &b, + ArrayRef offsets, + ArrayRef sizes) const { + auto packOp = cast(op); + Location loc = packOp.getLoc(); + + // The tiling is applied on interchanged dimensions. We have to undo the + // interchange to map sizes and offsets to the original input. 
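+    // For example (hypothetical), with outer_dims_perm = [1, 0] the iteration
+    // offsets [%i, %j] are swapped back to [%j, %i] before they are scaled by
+    // the inner tile sizes to index into the original source.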
+ int64_t inputRank = packOp.getSourceRank(); + SmallVector origOffsets(offsets); + SmallVector origSizes(sizes); + applyPermToRange(origOffsets, origSizes, + invertPermutationVector(packOp.getOuterDimsPerm())); + + DenseMap dimAndTileMapping = + packOp.getDimAndTileMapping(); + SmallVector srcDimValues = + tensor::getMixedSizes(b, loc, packOp.getSource()); + SmallVector inputIndices, inputSizes; + for (auto dim : llvm::seq(0, inputRank)) { + using AV = affine::AffineValueExpr; + affine::AffineBuilder ab(b, loc); + AffineExpr dim0, dim1, sym; + bindDims(b.getContext(), dim0, dim1); + bindSymbols(b.getContext(), sym); + if (dimAndTileMapping.count(dim)) { + // If the data dimension is tiled, the i-th index is the product of + // offset_i and tile_i, and the i-th size is the product of sizes_i and + // tile_i. + auto avOffset = AV(dim0).bind(origOffsets[dim]); + auto avSize = AV(dim0).bind(origSizes[dim]); + auto avTileSize = AV(sym).bind(dimAndTileMapping[dim]); + inputIndices.push_back(ab.mul(avOffset, avTileSize)); + inputSizes.push_back(ab.mul(avSize, avTileSize)); + } else { + inputIndices.push_back(origOffsets[dim]); + inputSizes.push_back(origSizes[dim]); + } + + // Limit the size of the input operand for incomplete tiles. + if (packOp.getPaddingValue()) { + OpFoldResult dimSize = srcDimValues[dim]; + auto avDimSize = AV(dim0).bind(dimSize); + auto avInputIdx = AV(dim1).bind(inputIndices.back()); + inputSizes.back() = + ab.min({inputSizes.back(), ab.sub(avDimSize, avInputIdx)}); + } + } + + auto oneAttr = b.getI64IntegerAttr(1); + SmallVector strides(inputRank, oneAttr); + + SmallVector tiledOperands; + auto sourceSlice = b.create( + loc, packOp.getSource(), inputIndices, inputSizes, strides); + tiledOperands.push_back(sourceSlice); + + SmallVector outputOffsets, outputSizes; + if (failed(getResultTilePosition(op, b, 0, offsets, sizes, outputOffsets, + outputSizes))) + return {}; + + strides.append(packOp.getDestRank() - inputRank, oneAttr); + auto outSlice = b.create( + loc, packOp.getDest(), outputOffsets, outputSizes, strides); + tiledOperands.push_back(outSlice); + + if (auto val = packOp.getPaddingValue()) + tiledOperands.push_back(val); + for (auto tile : packOp.getInnerTiles()) + tiledOperands.push_back(tile); + + Operation *tiledPackOp = b.create( + loc, TypeRange{outSlice.getType()}, tiledOperands, op->getAttrs()); + + return TilingResult{ + {tiledPackOp}, + SmallVector(tiledPackOp->getResults()), + llvm::to_vector(ArrayRef{sourceSlice, outSlice})}; + } + + LogicalResult + getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber, + ArrayRef offsets, + ArrayRef sizes, + SmallVector &resultOffsets, + SmallVector &resultSizes) const { + // The iteration domain is over outer dimensions of packed layout. In this + // context, the outer dimensions of `resultOffsets` are `offsets`. The + // inner dimensions of `resultOffsets` are zeros because tiling is not + // applied to them. 
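+    // For example (hypothetical), tiling a pack that produces
+    // tensor<16x8x8x32xf32> with offsets [%i, %j] and sizes [4, 2] yields the
+    // result tile offsets [%i, %j, 0, 0] and sizes [4, 2, 8, 32].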
+ auto packOp = cast(op); + int64_t inputRank = packOp.getSourceRank(); + int64_t outputRank = packOp.getDestRank(); + auto zeroAttr = b.getI64IntegerAttr(0); + resultOffsets.assign(offsets.begin(), offsets.end()); + resultOffsets.append(outputRank - inputRank, zeroAttr); + + ReifiedRankedShapedTypeDims outputShape; + (void)reifyResultShapes(b, packOp, outputShape); + resultSizes.assign(sizes.begin(), sizes.end()); + for (auto dataTileDim : llvm::seq(inputRank, outputRank)) + resultSizes.push_back(outputShape[0][dataTileDim]); + + return success(); + } + + FailureOr + generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber, + ArrayRef offsets, + ArrayRef sizes) const { + auto packOp = cast(op); + int64_t numTiles = packOp.getInnerDimsPos().size(); + + // tensor.pack op is fusible (as a producer) only if full inner tiles are + // iterated or inner dims are not tiled. Otherwise, it will generate a + // sequence of non-trivial ops (for partial tiles). + for (auto offset : offsets.take_back(numTiles)) + if (!isConstantIntValue(offset, 0)) + return failure(); + + for (auto iter : + llvm::zip_equal(packOp.getMixedTiles(), sizes.take_back(numTiles))) + if (!isEqualConstantIntOrValue(std::get<0>(iter), std::get<1>(iter))) + return failure(); + + FailureOr tilingResult = getTiledImplementation( + op, b, offsets.drop_back(numTiles), sizes.drop_back(numTiles)); + if (failed(tilingResult)) + return failure(); + return tilingResult.value(); + } + + /// Method to return the position of iteration domain tile computed by the + /// tiled operation. In current `tensor.pack` context, the `resultOffsets` and + /// `resultSizes` only cover outer dimensions. + LogicalResult getIterationDomainTileFromOperandTile( + Operation *op, OpBuilder &b, unsigned operandNumber, + ArrayRef offsets, ArrayRef sizes, + SmallVectorImpl &resultOffsets, + SmallVectorImpl &resultSizes) const { + if (operandNumber != 0) + return failure(); + + auto packOp = cast(op); + // It is not trivial to infer dest tile from source tile if `packOp` has + // padding semantic. + if (packOp.getPaddingValue()) + return failure(); + + Location loc = packOp.getLoc(); + + SmallVector outerDimOffsets, outerDimSizes; + DenseMap dimAndTileMapping = + packOp.getDimAndTileMapping(); + for (auto dim : llvm::seq(packOp.getSourceRank())) { + if (dimAndTileMapping.count(dim)) { + FailureOr cstSize = + ValueBoundsConstraintSet::computeConstantBound( + presburger::BoundType::UB, sizes[dim], + /*stopCondition=*/nullptr, /*closedUB=*/true); + std::optional cstInnerSize = + getConstantIntValue(dimAndTileMapping[dim]); + // Currently fusing `packOp` as consumer only expects perfect tiling + // scenario because even if without padding semantic, the `packOp` may + // also yield incomplete tiles. E.g. tensor<30xf32> -> tensor<5x6xf32>, + // where the `tileSize` from operand of `packOp` is 5, which is not + // exactly divided by `innerTile`(=6) of `packOp`. As the result: + // 1. the first slice is extracted from (0) to (4) and inserted into + // (0,0)~(0,4) at first row. + // 2. the second slice is extracted from (5) to (9) and SHOULD BE + // respectively inserted into two rows with different length, including + // first row: (0,5) and second row (1,0)~(1,3). It is hard to coordinate + // them, thus adding below constraint to bypass them temporarily. In + // another word, we can only support tiling with consumer if the tile + // size for the producer is a multiple of the inner tile size for the + // packed dimensions at this moment. 
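+        // For contrast, a hypothetical aligned case that is accepted: with an
+        // inner tile of 6, an operand tile at offset 12 with size 18 maps to
+        // outer offset 12 floordiv 6 = 2 and outer size 18 ceildiv 6 = 3,
+        // i.e. whole rows of the packed layout.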
+ if (failed(cstSize) || !cstInnerSize || *cstSize % *cstInnerSize != 0) { + return failure(); + } + + using AV = affine::AffineValueExpr; + affine::AffineBuilder ab(b, loc); + AffineExpr dim0, sym; + bindDims(b.getContext(), dim0); + bindSymbols(b.getContext(), sym); + auto avOffset = AV(dim0).bind(offsets[dim]); + auto avSize = AV(dim0).bind(sizes[dim]); + auto avTileSize = AV(sym).bind(dimAndTileMapping[dim]); + outerDimOffsets.push_back(ab.floor(avOffset, avTileSize)); + outerDimSizes.push_back(ab.ceil(avSize, avTileSize)); + } else { + outerDimOffsets.push_back(offsets[dim]); + outerDimSizes.push_back(sizes[dim]); + } + } + applyPermToRange(outerDimOffsets, outerDimSizes, packOp.getOuterDimsPerm()); + resultOffsets = outerDimOffsets; + resultSizes = outerDimSizes; + return success(); + } + + /// Method to return the tiled implementation of tensor.pack as a consumer. + FailureOr getTiledImplementationFromOperandTile( + Operation *op, OpBuilder &b, unsigned operandNumber, + ArrayRef offsets, ArrayRef sizes) const { + if (operandNumber != 0) + return failure(); + + auto packOp = cast(op); + Location loc = packOp.getLoc(); + + int64_t inputRank = packOp.getSourceRank(); + auto oneAttr = b.getI64IntegerAttr(1); + SmallVector strides(inputRank, oneAttr); + + SmallVector tiledOperands; + auto sourceSlice = b.create( + loc, packOp.getSource(), offsets, sizes, strides); + tiledOperands.push_back(sourceSlice); + + SmallVector outerDimOffsets, outerDimSizes; + if (failed(getIterationDomainTileFromOperandTile( + op, b, /*operandNumber=*/0, offsets, sizes, outerDimOffsets, + outerDimSizes))) + return failure(); + + SmallVector outputOffsets, outputSizes; + if (failed(getResultTilePosition(op, b, 0, outerDimOffsets, outerDimSizes, + outputOffsets, outputSizes))) + return failure(); + + strides.append(packOp.getDestRank() - inputRank, oneAttr); + auto outSlice = b.create( + loc, packOp.getDest(), outputOffsets, outputSizes, strides); + tiledOperands.push_back(outSlice); + + assert(!packOp.getPaddingValue() && "Expect no padding semantic"); + for (auto tile : packOp.getInnerTiles()) + tiledOperands.push_back(tile); + + Operation *tiledPackOp = b.create( + loc, TypeRange{outSlice.getType()}, tiledOperands, op->getAttrs()); + + return TilingResult{ + {tiledPackOp}, + SmallVector(tiledPackOp->getResults()), + llvm::to_vector(ArrayRef{sourceSlice, outSlice})}; + } +}; + +struct UnpackTileDimInfo { + bool isAlignedToInnerTileSize; + OpFoldResult sourceOffset; + OpFoldResult sourceSize; + OpFoldResult resultOffset; + OpFoldResult destExpandedSize; +}; + +/// Returns the needed information for tiling unpack op on `tileDim` with given +/// `tileOffset` and `tileSize`. For more details, see the comment of the +/// `getTiledImplementation`. +static UnpackTileDimInfo getUnpackTileDimInfo(OpBuilder &b, UnPackOp unpackOp, + int64_t tileDim, + OpFoldResult tileOffset, + OpFoldResult tileSize) { + UnpackTileDimInfo info; + Attribute zeroAttr = b.getIndexAttr(0); + Attribute oneAttr = b.getIndexAttr(1); + DenseMap dimAndTileMapping = + unpackOp.getDimAndTileMapping(); + // The dimension is not one of packed data dimension. 
+ if (!dimAndTileMapping.count(tileDim)) { + info.isAlignedToInnerTileSize = true; + info.sourceOffset = tileOffset; + info.sourceSize = tileSize; + info.resultOffset = zeroAttr; + info.destExpandedSize = tileSize; + return info; + } + + Location loc = unpackOp.getLoc(); + using AV = affine::AffineValueExpr; + affine::AffineBuilder ab(b, loc); + AffineExpr dim0, dim1, sym0; + bindDims(b.getContext(), dim0, dim1); + bindSymbols(b.getContext(), sym0); + + OpFoldResult innerTileSize = dimAndTileMapping[tileDim]; + + info.isAlignedToInnerTileSize = false; + FailureOr cstSize = ValueBoundsConstraintSet::computeConstantBound( + presburger::BoundType::UB, tileSize, + /*stopCondition=*/nullptr, /*closedUB=*/true); + std::optional cstInnerSize = getConstantIntValue(innerTileSize); + if (!failed(cstSize) && cstInnerSize) { + if (*cstSize % *cstInnerSize == 0) + info.isAlignedToInnerTileSize = true; + + // If the tiling size equals to the inner tiling size, the outer dims are + // always 1. + if (*cstInnerSize == *cstSize) { + auto lhs = AV(dim0).bind(tileOffset); + auto rhs = AV(dim1).bind(innerTileSize); + info.sourceOffset = ab.floor(lhs, rhs); + info.sourceSize = oneAttr; + info.resultOffset = zeroAttr; + info.destExpandedSize = tileSize; + return info; + } + } + + if (info.isAlignedToInnerTileSize) { + info.sourceOffset = + ab.floor(AV(dim0).bind(tileOffset), AV(dim1).bind(innerTileSize)); + info.resultOffset = zeroAttr; + info.destExpandedSize = tileSize; + + // The ceilDiv is needed here because there could be incomplete tile even + // it is perfect tiling cases. E.g., + // %0 = unpack tensor<33x2xf32> into tensor<64xf32> + // If the tiling size is 32, there will be 3 tiles. Two of them have + // size=32; one of them have size=2. The size is represented using + // affine_min op; we need ceilDiv. + info.sourceSize = + ab.ceil(AV(dim0).bind(tileSize), AV(dim1).bind(innerTileSize)); + return info; + } + + affine::DivModValue firstCoord = affine::getDivMod( + b, loc, getValueOrCreateConstantIndexOp(b, loc, tileOffset), + getValueOrCreateConstantIndexOp(b, loc, innerTileSize)); + OpFoldResult tileExclusiveBound = + ab.add(AV(dim0).bind(tileOffset), AV(dim1).bind(tileSize)); + affine::DivModValue lastCoord = affine::getDivMod( + b, loc, + getValueOrCreateConstantIndexOp( + b, loc, + ab.sub(AV(dim0).bind(tileExclusiveBound), AV(dim1).bind(oneAttr))), + getValueOrCreateConstantIndexOp(b, loc, innerTileSize)); + + OpFoldResult lengthMinusOne = ab.sub(AV(dim0).bind(lastCoord.quotient), + AV(dim1).bind(firstCoord.quotient)); + info.sourceSize = + ab.add(AV(dim0).bind(lengthMinusOne), AV(dim1).bind(oneAttr)); + info.sourceOffset = firstCoord.quotient; + info.resultOffset = firstCoord.remainder; + // Do not create an Affine ops for expanded size because the affine op is too + // complicated which would trigger an issue in affine ops simplification. + info.destExpandedSize = b.createOrFold( + loc, getValueOrCreateConstantIndexOp(b, loc, info.sourceSize), + getValueOrCreateConstantIndexOp(b, loc, innerTileSize)); + return info; +} + +struct UnPackOpTiling + : public TilingInterface::ExternalModel { + + SmallVector getLoopIteratorTypes(Operation *op) const { + auto unpackOp = cast(op); + SmallVector iteratorTypes( + unpackOp.getDestRank(), utils::IteratorType::parallel); + return iteratorTypes; + } + + SmallVector getIterationDomain(Operation *op, OpBuilder &b) const { + return getPackUnPackIterationDomain(cast(op), b); + } + + /// There are two cases in tiling unpack ops. 
If the tiling size is aligned to + /// the inner tile size, the corresponding tiles of source are all complete. + /// Otherwise, there are in-complete tiles. We will need to expand the slice + /// of source for getting complete tiles. The tiled unpack op unpacks more + /// data from source, so We'll need an extract_slice op to shift and truncate + /// the output. + /// Take Nn_to_N as an example. Say that N=32, n=8, and tiling_size=15. The + /// coordinates of second tile (i.e., result[15..31]) are + /// [(1, 7), (2, 0,), (2, 1) ... (3, 6), (3, 7)]. The first row and the last + /// row are incomplete tiles. To represent the unpack op, we have to complete + /// the rows. I.e., the input coordinates would start with (1, 0); end with + /// (3, 7). In this context, the tiled unpack produces a (3 * n) elements + /// because there are 3 rows in total. Follow by a tensor.extract_slice op, we + /// can get the actual result. + FailureOr + getTiledImplementation(Operation *op, OpBuilder &b, + ArrayRef offsets, + ArrayRef sizes) const { + auto unpackOp = cast(op); + int64_t srcRank = unpackOp.getSourceRank(); + int64_t destRank = unpackOp.getDestRank(); + int64_t numInnerTiles = srcRank - destRank; + Location loc = unpackOp.getLoc(); + + // The perfect tiling case indicates that the tiling sizes are multiple of + // inner_tile_size. In this context, no extra data is needed when + // representing the tiled unpack op. + bool isPerfectTilingCase = true; + Attribute oneAttr = b.getIndexAttr(1); + SmallVector sliceSrcStrides(destRank, oneAttr); + SmallVector sliceSrcIndices, sliceSrcSizes; + SmallVector destExpandedSizes, resultOffsetsFromDest; + for (auto dim : llvm::seq(0, destRank)) { + UnpackTileDimInfo info = + getUnpackTileDimInfo(b, unpackOp, dim, offsets[dim], sizes[dim]); + if (!info.isAlignedToInnerTileSize) + isPerfectTilingCase = false; + sliceSrcIndices.push_back(info.sourceOffset); + sliceSrcSizes.push_back(info.sourceSize); + destExpandedSizes.push_back(info.destExpandedSize); + resultOffsetsFromDest.push_back(info.resultOffset); + } + + // The tiling is applied on destination dimensions. We have to apply the + // interchange on source dimensions if outer_dims_perm is set. 
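+    // For example (hypothetical), with outer_dims_perm = [1, 0] the per-dim
+    // source offsets/sizes computed above in destination order [%i, %j] are
+    // reordered to source order [%j, %i] before the inner-tile dims are
+    // appended.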
+ applyPermToRange(sliceSrcIndices, sliceSrcSizes, + unpackOp.getOuterDimsPerm()); + Attribute zeroAttr = b.getIndexAttr(0); + sliceSrcIndices.append(numInnerTiles, zeroAttr); + sliceSrcSizes.append(unpackOp.getMixedTiles()); + sliceSrcStrides.append(numInnerTiles, oneAttr); + SmallVector generatedSlices; + tensor::ExtractSliceOp sliceSource = b.create( + loc, unpackOp.getSource(), sliceSrcIndices, sliceSrcSizes, + sliceSrcStrides); + generatedSlices.push_back(sliceSource); + + SmallVector destStrides(destRank, oneAttr); + Value sliceDest; + if (isPerfectTilingCase) { + auto destSliceOp = b.create( + loc, unpackOp.getDest(), offsets, sizes, destStrides); + sliceDest = destSliceOp; + generatedSlices.push_back(destSliceOp); + } else { + sliceDest = b.create( + loc, destExpandedSizes, unpackOp.getDestType().getElementType()); + } + + SmallVector tiledOperands = {sliceSource.getResult(), sliceDest}; + for (auto tile : unpackOp.getInnerTiles()) + tiledOperands.push_back(tile); + + Operation *tiledUnpackOp = b.create( + loc, TypeRange{sliceDest.getType()}, tiledOperands, op->getAttrs()); + + if (isPerfectTilingCase) + return TilingResult{{tiledUnpackOp}, + SmallVector(tiledUnpackOp->getResults()), + generatedSlices}; + + auto extractSlice = b.create( + loc, tiledUnpackOp->getResult(0), resultOffsetsFromDest, sizes, + destStrides); + return TilingResult{ + {tiledUnpackOp}, {extractSlice.getResult()}, generatedSlices}; + } + + LogicalResult + getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber, + ArrayRef offsets, + ArrayRef sizes, + SmallVector &resultOffsets, + SmallVector &resultSizes) const { + resultOffsets = llvm::to_vector(offsets); + resultSizes = llvm::to_vector(sizes); + return success(); + } + + FailureOr + generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber, + ArrayRef offsets, + ArrayRef sizes) const { + FailureOr tilingResult = + getTiledImplementation(op, b, offsets, sizes); + if (failed(tilingResult)) + return failure(); + return tilingResult.value(); + } + + /// Method to return the position of iteration domain tile computed by the + /// tiled operation. + LogicalResult getIterationDomainTileFromOperandTile( + Operation *op, OpBuilder &b, unsigned operandNumber, + ArrayRef offsets, ArrayRef sizes, + SmallVectorImpl &resultOffsets, + SmallVectorImpl &resultSizes) const { + auto unPackOp = cast(op); + // If the operand tile is the dest, then no adjustment is needed. + if (operandNumber == unPackOp.getDestMutable().getOperandNumber()) { + resultOffsets = llvm::to_vector(offsets); + resultSizes = llvm::to_vector(sizes); + return success(); + } + Location loc = unPackOp.getLoc(); + + int64_t numTiles = unPackOp.getInnerDimsPos().size(); + auto destOffsets = offsets.drop_back(numTiles); + auto destSizes = sizes.drop_back(numTiles); + // The tiling is applied on interchanged dimensions. We have to undo the + // interchange to map sizes and offsets to the original input. 
+ int64_t outputRank = unPackOp.getDestRank(); + ReifiedRankedShapedTypeDims reifiedReturnShapes; + if (failed(reifyResultShapes(b, unPackOp, reifiedReturnShapes))) + return failure(); + SmallVector outputMixedSizes = reifiedReturnShapes.front(); + SmallVector origOffsets(destOffsets); + SmallVector origSizes(destSizes); + applyPermToRange(origOffsets, origSizes, + invertPermutationVector(unPackOp.getOuterDimsPerm())); + + DenseMap dimAndTileMapping = + unPackOp.getDimAndTileMapping(); + + for (auto dim : llvm::seq(0, outputRank)) { + using AV = affine::AffineValueExpr; + affine::AffineBuilder ab(b, loc); + AffineExpr dim0, dim1, sym0; + bindDims(b.getContext(), dim0, dim1); + bindSymbols(b.getContext(), sym0); + if (dimAndTileMapping.count(dim)) { + // If the data dimension is tiled, the i-th index is the product of + // offset_i and tile_i, and the i-th size is the product of sizes_i and + // tile_i. The sizes must be clamped to the sizes of the unpack result. + auto avOffset = AV(dim0).bind(origOffsets[dim]); + auto avSize = AV(dim0).bind(origSizes[dim]); + auto avTileSize = AV(sym0).bind(dimAndTileMapping[dim]); + auto avResultSize = AV(dim0).bind(outputMixedSizes[dim]); + resultOffsets.push_back(ab.mul(avOffset, avTileSize)); + auto avResultOffset = AV(dim1).bind(resultOffsets.back()); + resultSizes.push_back(ab.min({ab.mul(avSize, avTileSize), + ab.sub(avResultSize, avResultOffset)})); + } else { + resultOffsets.push_back(origOffsets[dim]); + resultSizes.push_back(origSizes[dim]); + } + } + return success(); + } + + /// Method to return the tiled implementation of tensor.unpack as a consumer. + FailureOr getTiledImplementationFromOperandTile( + Operation *op, OpBuilder &b, unsigned operandNumber, + ArrayRef offsets, ArrayRef sizes) const { + auto unPackOp = cast(op); + // tensor.unpack op is fusible (as a consumer) only if inner dims are not + // tiled. + int64_t numTiles = unPackOp.getInnerDimsPos().size(); + for (auto iter : + llvm::zip_equal(unPackOp.getMixedTiles(), sizes.take_back(numTiles))) { + if (!isEqualConstantIntOrValue(std::get<0>(iter), std::get<1>(iter))) + return failure(); + } + + Location loc = unPackOp.getLoc(); + + // Fetch offset/size for creating the slice of the dest operand of + // unpack op. + SmallVector outputOffsets, outputSizes; + if (failed(getIterationDomainTileFromOperandTile( + op, b, /*operandNumber=*/0, offsets, sizes, outputOffsets, + outputSizes))) + return failure(); + + auto oneAttr = b.getI64IntegerAttr(1); + int64_t outputRank = unPackOp.getDestRank(); + SmallVector strides(outputRank, oneAttr); + + SmallVector tiledOperands; + // Create slice of the dest operand. + auto extractDestSlice = b.create( + loc, unPackOp.getDest(), outputOffsets, outputSizes, strides); + tiledOperands.push_back(extractDestSlice); + + SmallVector inputOffsets, inputSizes; + strides.append(unPackOp.getSourceRank() - outputRank, oneAttr); + // Create slice of the source operand. + auto extractSourceSlice = b.create( + loc, unPackOp.getSource(), offsets, sizes, strides); + tiledOperands.insert(tiledOperands.begin(), extractSourceSlice); + for (auto tile : unPackOp.getInnerTiles()) + tiledOperands.push_back(tile); + + // Create tiled unpack op. 
+ Operation *tiledUnPackOp = + b.create(loc, TypeRange{extractDestSlice.getType()}, + tiledOperands, op->getAttrs()); + + return TilingResult{{tiledUnPackOp}, + SmallVector(tiledUnPackOp->getResults()), + llvm::to_vector(ArrayRef{ + extractSourceSlice, extractDestSlice})}; + } +}; + } // namespace template @@ -592,8 +1237,18 @@ void mlir::linalg::registerTilingInterfaceExternalModels( DialectRegistry ®istry) { registry.addExtension(+[](MLIRContext *ctx, linalg::LinalgDialect *dialect) { registerOne(ctx); + linalg::PackOp::attachInterface(*ctx); + linalg::UnPackOp::attachInterface(*ctx); registerAll< #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc" >(ctx); }); } + +void mlir::linalg::registerTilingInterfaceExternalModelsForPackUnPackOps( + DialectRegistry ®istry) { + registry.addExtension(+[](MLIRContext *ctx, LinalgDialect *dialect) { + linalg::PackOp::attachInterface(*ctx); + linalg::UnPackOp::attachInterface(*ctx); + }); +} diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp index 50593b08ad74b..dcd50cc44f81b 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -217,7 +217,7 @@ struct PackedOperandsDimList { } // namespace FailureOr linalg::lowerPack(RewriterBase &rewriter, - tensor::PackOp packOp, + linalg::PackOp packOp, bool lowerPadLikeWithInsertSlice) { // 1. Filter out NYI cases. auto packedTensorType = @@ -238,7 +238,7 @@ FailureOr linalg::lowerPack(RewriterBase &rewriter, PackingMetadata packingMetadata = computePackingMetadata( packedTensorType.getRank(), packOp.getInnerDimsPos()); SmallVector packedToStripMinedShapePerm = - tensor::getPackInverseDestPerm(packOp); + getPackInverseDestPerm(packOp); // 3. Compute the stripMinedShape: this is the packed shape before any outer // or inner permutations have been applied. @@ -353,7 +353,7 @@ FailureOr linalg::lowerPack(RewriterBase &rewriter, } FailureOr -linalg::lowerUnPack(RewriterBase &rewriter, tensor::UnPackOp unPackOp, +linalg::lowerUnPack(RewriterBase &rewriter, linalg::UnPackOp unPackOp, bool lowerUnpadLikeWithExtractSlice) { Location loc = unPackOp->getLoc(); OpBuilder::InsertionGuard g(rewriter); @@ -388,7 +388,7 @@ linalg::lowerUnPack(RewriterBase &rewriter, tensor::UnPackOp unPackOp, // before any outer or inner permutations have been applied. PackingMetadata packingMetadata; SmallVector packedToStripMinedShapePerm = - tensor::getUnPackInverseSrcPerm(unPackOp, packingMetadata); + getUnPackInverseSrcPerm(unPackOp, packingMetadata); // 2. Compute the stripMinedShape: this is the packed shape without outer and // inner permutations. @@ -493,8 +493,8 @@ FailureOr linalg::pack(RewriterBase &rewriter, llvm::interleaveComma(iteratorTypes, DBGS() << "iterators: "); DBGSNL();); - SmallVector packOps; - SmallVector unPackOps; + SmallVector packOps; + SmallVector unPackOps; // Step 1. Pack each dim of the LinalgOp metadata by packedSizes[i]. 
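For downstream users, the new registerTilingInterfaceExternalModelsForPackUnPackOps entry point only needs to be wired into a DialectRegistry before the context is created. A minimal client-side sketch follows; the include paths and the surrounding setup are assumptions, only the registration function itself comes from this patch.

#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h" // assumed header
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/DialectRegistry.h"
#include "mlir/IR/MLIRContext.h"

// Populate a registry so that linalg.pack / linalg.unpack implement
// TilingInterface once the Linalg dialect is loaded into the context.
static void setupRegistry(mlir::DialectRegistry &registry) {
  registry.insert<mlir::linalg::LinalgDialect, mlir::tensor::TensorDialect>();
  mlir::linalg::registerTilingInterfaceExternalModelsForPackUnPackOps(registry);
}

int main() {
  mlir::DialectRegistry registry;
  setupRegistry(registry);
  mlir::MLIRContext context(registry);
  context.loadDialect<mlir::linalg::LinalgDialect>();
  return 0;
}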
PackedOperandsDimList listOfPackedOperandsDim; for (int64_t i = 0, e = packedSizes.size(); i < e; ++i) { @@ -545,7 +545,7 @@ FailureOr linalg::pack(RewriterBase &rewriter, inputsAndInits.push_back(operand); continue; } - Value dest = tensor::PackOp::createDestinationTensor( + Value dest = linalg::PackOp::createDestinationTensor( rewriter, loc, operand, innerPackSizes, innerPos, /*outerDimsPerm=*/{}); ShapedType operandType = cast(operand.getType()); @@ -554,11 +554,11 @@ FailureOr linalg::pack(RewriterBase &rewriter, return getConstantIntValue(tile).has_value(); }); if (areConstantTiles && operandType.hasStaticShape() && - !tensor::PackOp::requirePaddingValue( + !linalg::PackOp::requirePaddingValue( operandType.getShape(), innerPos, cast(dest.getType()).getShape(), {}, innerPackSizes)) { - packOps.push_back(rewriter.create( + packOps.push_back(rewriter.create( loc, operand, dest, innerPos, innerPackSizes)); } else { // TODO: value of the padding attribute should be determined by @@ -566,7 +566,7 @@ FailureOr linalg::pack(RewriterBase &rewriter, auto zeroAttr = rewriter.getZeroAttr(getElementTypeOrSelf(dest.getType())); Value zero = rewriter.create(loc, zeroAttr); - packOps.push_back(rewriter.create( + packOps.push_back(rewriter.create( loc, operand, dest, innerPos, innerPackSizes, zero)); } inputsAndInits.push_back(packOps.back()); @@ -586,14 +586,14 @@ FailureOr linalg::pack(RewriterBase &rewriter, // Step 4. Propagate packing to all the op results. for (OpResult result : packedLinalgOp->getResults()) { int64_t resultNum = result.getResultNumber(); - tensor::PackOp maybePackedInit = - inits[resultNum].getDefiningOp(); + linalg::PackOp maybePackedInit = + inits[resultNum].getDefiningOp(); if (!maybePackedInit) { results.push_back(result); continue; } // Build the symmetrical UnPackOp to the existing PackOp. - unPackOps.push_back(rewriter.create( + unPackOps.push_back(rewriter.create( packedLinalgOp->getLoc(), result, maybePackedInit.getSource(), maybePackedInit.getInnerDimsPos(), maybePackedInit.getMixedTiles())); results.push_back(unPackOps.back()); @@ -674,15 +674,15 @@ static LinalgOp transposeOneLinalgOperandAndReplace( } FailureOr -linalg::packTranspose(RewriterBase &rewriter, tensor::PackOp packOp, - linalg::LinalgOp linalgOp, tensor::UnPackOp maybeUnPackOp, +linalg::packTranspose(RewriterBase &rewriter, linalg::PackOp packOp, + linalg::LinalgOp linalgOp, linalg::UnPackOp maybeUnPackOp, ArrayRef outerPerm, ArrayRef innerPerm) { Location loc = linalgOp.getLoc(); // Step 1. Transpose packOp. rewriter.setInsertionPoint(packOp); - tensor::PackOp transposedPackOp = + linalg::PackOp transposedPackOp = packOp.createTransposedClone(rewriter, loc, innerPerm, outerPerm); if (!packOp.getResult().hasOneUse()) @@ -733,7 +733,7 @@ linalg::packTranspose(RewriterBase &rewriter, tensor::PackOp packOp, rewriter, linalgOp, packUse, permutation, transposedPackOp.getResult()); // Step 3. Maybe transpose unPackOp. - tensor::UnPackOp transposedUnPackOp; + linalg::UnPackOp transposedUnPackOp; if (maybeUnPackOp) { OpOperand &opOperand = transposedLinalgOp->getOpOperand(packUseOperandNumber); @@ -1024,7 +1024,7 @@ LogicalResult ExtractSliceOfPadTensorSwapPattern::matchAndRewrite( /// /// This method assumes that all outer dims for this pack Op are 1. 
static Value getPackOpSourceOrPaddedSource(OpBuilder &builder, - tensor::PackOp packOp) { + linalg::PackOp packOp) { Value input = packOp.getSource(); if (!packOp.getPaddingValue()) { return input; @@ -1141,7 +1141,7 @@ getPackUnpackRankReducedPerm(ArrayRef shape, } LogicalResult DecomposeOuterUnitDimsPackOpPattern::matchAndRewrite( - tensor::PackOp packOp, PatternRewriter &rewriter) const { + linalg::PackOp packOp, PatternRewriter &rewriter) const { // TODO: support the case that outer dimensions are not all 1s. A // tensor.expand_shape will be generated in this case. if (llvm::any_of(packOp.getAllOuterDims(), @@ -1242,7 +1242,7 @@ LogicalResult DecomposeOuterUnitDimsPackOpPattern::matchAndRewrite( } LogicalResult DecomposeOuterUnitDimsUnPackOpPattern::matchAndRewrite( - tensor::UnPackOp unpackOp, PatternRewriter &rewriter) const { + linalg::UnPackOp unpackOp, PatternRewriter &rewriter) const { int64_t srcRank = unpackOp.getSourceRank(); int64_t destRank = unpackOp.getDestRank(); ArrayRef srcShape = unpackOp.getSourceType().getShape(); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index f2c23c49a78e8..ae04c2b6b2a5b 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -1499,11 +1499,11 @@ vectorizeAsLinalgGeneric(RewriterBase &rewriter, VectorizationState &state, return success(); } -/// Given a tensor::PackOp, return the `dest` shape before any packing +/// Given a linalg::PackOp, return the `dest` shape before any packing /// permutations. -static SmallVector getTiledPackShape(tensor::PackOp packOp, +static SmallVector getTiledPackShape(linalg::PackOp packOp, ArrayRef destShape) { - return applyPermutation(destShape, tensor::getPackInverseDestPerm(packOp)); + return applyPermutation(destShape, linalg::getPackInverseDestPerm(packOp)); } /// Given an input, the mixed destSizes, and the vector sizes for vectorization, @@ -1558,7 +1558,7 @@ static Operation *createWriteOrMaskedWrite(OpBuilder &builder, Location loc, return write; } -/// Vectorize tensor::PackOp with (1) static innerTiles (2) constant +/// Vectorize linalg::PackOp with (1) static innerTiles (2) constant /// padding value and (3) input vector sizes into: /// masked_transfer_read->shape_cast->transpose->transfer_write_in_bounds /// As in the following example: @@ -1585,7 +1585,7 @@ static Operation *createWriteOrMaskedWrite(OpBuilder &builder, Location loc, /// determined by the result tensor shape. Also, we update the inBounds /// attribute instead of masking. static LogicalResult -vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp, +vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp, ArrayRef inputVectorSizes, SmallVectorImpl &newResults) { // TODO: Introduce a parent class that will handle the insertion point update. @@ -1639,7 +1639,7 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp, // Create TransposeOp. 
auto destPermutation = - invertPermutationVector(tensor::getPackInverseDestPerm(packOp)); + invertPermutationVector(getPackInverseDestPerm(packOp)); auto transposeOp = rewriter.create( loc, shapeCastOp.getResult(), destPermutation); @@ -1651,7 +1651,7 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp, return success(); } -/// Vectorize a `tensor::UnPackOp` to these 4 Ops: +/// Vectorize a `linalg::UnPackOp` to these 4 Ops: /// Vector::TransferReadOp - Reads a vector from the source tensor /// vector::TransposeOp - Transpose the Source tensor /// ShapeCastOp - Reshape the data based on the target. @@ -1661,7 +1661,7 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp, /// * the vector sizes are determined by the input operand and attributes, /// * update the inBounds attribute instead of masking. static LogicalResult -vectorizeAsTensorUnpackOp(RewriterBase &rewriter, tensor::UnPackOp unpackOp, +vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp, ArrayRef inputVectorSizes, SmallVectorImpl &newResults) { @@ -1754,7 +1754,7 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, tensor::UnPackOp unpackOp, PackingMetadata packMetadata; SmallVector lastDimToInsertPosPerm = - tensor::getUnPackInverseSrcPerm(unpackOp, packMetadata); + getUnPackInverseSrcPerm(unpackOp, packMetadata); ShapedType maskedOpShapedType = cast(readResult.getType()); SmallVector stripMineShape(maskedOpShapedType.getShape()); mlir::Type stripMineElemType = maskedOpShapedType.getElementType(); @@ -1887,7 +1887,7 @@ vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op, /// Need to check if the inner-tiles are static/constant. static LogicalResult -vectorizeUnPackOpPrecondition(tensor::UnPackOp unpackOp, +vectorizeUnPackOpPrecondition(linalg::UnPackOp unpackOp, ArrayRef inputVectorSizes) { if (llvm::any_of(unpackOp.getInnerTiles(), [](OpFoldResult res) { @@ -2007,7 +2007,7 @@ static LogicalResult vectorizeLinalgOpPrecondition( } static LogicalResult -vectorizePackOpPrecondition(tensor::PackOp packOp, +vectorizePackOpPrecondition(linalg::PackOp packOp, ArrayRef inputVectorSizes) { auto padValue = packOp.getPaddingValue(); Attribute cstAttr; @@ -2203,10 +2203,10 @@ LogicalResult mlir::linalg::vectorizeOpPrecondition( .Case([&](auto padOp) { return vectorizePadOpPrecondition(padOp, inputVectorSizes); }) - .Case([&](auto packOp) { + .Case([&](auto packOp) { return vectorizePackOpPrecondition(packOp, inputVectorSizes); }) - .Case([&](auto unpackOp) { + .Case([&](auto unpackOp) { return vectorizeUnPackOpPrecondition(unpackOp, inputVectorSizes); }) .Case([&](auto sliceOp) { @@ -2231,7 +2231,7 @@ static void convertAffineApply(RewriterBase &rewriter, LinalgOp linalgOp) { } bool mlir::linalg::hasVectorizationImpl(Operation *op) { - return isa(op); } @@ -2308,18 +2308,18 @@ LogicalResult mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op, return vectorizeAsTensorPadOp(rewriter, padOp, inputVectorSizes, results); }) - .Case([&](auto packOp) { + .Case([&](auto packOp) { return vectorizeAsTensorPackOp(rewriter, packOp, inputVectorSizes, results); }) + .Case([&](auto unpackOp) { + return vectorizeAsTensorUnpackOp(rewriter, unpackOp, + inputVectorSizes, results); + }) .Case([&](auto sliceOp) { return vectorizeAsInsertSliceOp(rewriter, sliceOp, inputVectorSizes, results); }) - .Case([&](auto unpackOp) { - return vectorizeAsTensorUnpackOp(rewriter, unpackOp, - inputVectorSizes, results); - }) .Default([](auto) { return failure(); }); if (failed(vectorizeResult)) { 
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp index 23fdea1531964..99ec9886cd0e1 100644 --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -164,10 +164,64 @@ static void unpackRanges(OpBuilder &builder, Location loc, //===----------------------------------------------------------------------===// // General utilities //===----------------------------------------------------------------------===// +// +/// The permutation can be obtained from two permutations: +/// a) Compute the permutation vector to move the last `numPackedDims` into +/// the `innerPosDims` of a shape of rank `rank`. +/// b) Compute the permutation vector to move outer dims if the +/// `outerPerm` parameter is not empty. +/// Apply (b) permutation on (a) permutation to get the final permutation. +static SmallVector +computePackUnPackPerm(int64_t rank, ArrayRef &innerDimsPos, + ArrayRef &outerPerm, + PackingMetadata &packingMetadata) { + int64_t numPackedDims = innerDimsPos.size(); + auto lastDims = + llvm::to_vector(llvm::seq(rank - numPackedDims, rank)); + packingMetadata = computePackingMetadata(rank, innerDimsPos); + SmallVector innerPositionsPerm = + computePermutationVector(rank, lastDims, packingMetadata.insertPositions); + + SmallVector outerPos = packingMetadata.outerPositions; + if (!outerPerm.empty()) + applyPermutationToVector(outerPos, outerPerm); + SmallVector outerPositionPerm = + computePermutationVector(rank, packingMetadata.outerPositions, outerPos); + + SmallVector packInverseDestPermutation = innerPositionsPerm; + applyPermutationToVector(packInverseDestPermutation, outerPositionPerm); + return packInverseDestPermutation; +} namespace mlir { namespace linalg { +SmallVector getPackInverseDestPerm(PackOp packOp) { + + PackingMetadata pMetadata; + int64_t packedRank = packOp.getDestType().getRank(); + ArrayRef innerDimPos = packOp.getInnerDimsPos(); + ArrayRef outerPerm = packOp.getOuterDimsPerm(); + SmallVector packInvDestPerm = + computePackUnPackPerm(packedRank, innerDimPos, outerPerm, pMetadata); + return packInvDestPerm; +} + +SmallVector getUnPackInverseSrcPerm(UnPackOp unpackOp) { + PackingMetadata metadata; + return getUnPackInverseSrcPerm(unpackOp, metadata); +} + +SmallVector getUnPackInverseSrcPerm(UnPackOp unpackOp, + PackingMetadata &metadata) { + int64_t unpackRank = unpackOp.getSourceType().getRank(); + ArrayRef innerDimPos = unpackOp.getInnerDimsPos(); + ArrayRef outerPerm = unpackOp.getOuterDimsPerm(); + SmallVector unpackInvSrcPerm = + computePackUnPackPerm(unpackRank, innerDimPos, outerPerm, metadata); + return unpackInvSrcPerm; +} + bool allIndexingsAreProjectedPermutation(LinalgOp op) { return llvm::all_of(op.getIndexingMapsArray(), [](AffineMap m) { return m.isProjectedPermutation(/*allowZeroInResults=*/true); diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp index fa82bcb816a2a..bc1cb24303ad2 100644 --- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp +++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp @@ -498,6 +498,20 @@ FailureOr mlir::loopUnrollByFactor( return resultLoops; } +/// Unrolls this loop completely. 
+LogicalResult mlir::loopUnrollFull(scf::ForOp forOp) { + IRRewriter rewriter(forOp.getContext()); + std::optional mayBeConstantTripCount = getConstantTripCount(forOp); + if (!mayBeConstantTripCount.has_value()) + return failure(); + uint64_t tripCount = *mayBeConstantTripCount; + if (tripCount == 0) + return success(); + if (tripCount == 1) + return forOp.promoteIfSingleIteration(rewriter); + return loopUnrollByFactor(forOp, tripCount); +} + /// Check if bounds of all inner loops are defined outside of `forOp` /// and return false if not. static bool areInnerBoundsInvariant(scf::ForOp forOp) { diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp index 48be287ef833b..0cf5f0823be63 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp @@ -84,7 +84,11 @@ struct SPIRVInlinerInterface : public DialectInlinerInterface { // TODO: we need to filter OpKill here to avoid inlining it to // a loop continue construct: // https://github.com/KhronosGroup/SPIRV-Headers/issues/86 - // However OpKill is fragment shader specific and we don't support it yet. + // For now, we just disallow inlining OpKill anywhere in the code, + // but this restriction should be relaxed, as pointed above. + if (isa(op)) + return false; + return true; } diff --git a/mlir/lib/Dialect/Tensor/IR/CMakeLists.txt b/mlir/lib/Dialect/Tensor/IR/CMakeLists.txt index d9d09d6361a2f..5425615dac393 100644 --- a/mlir/lib/Dialect/Tensor/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/Tensor/IR/CMakeLists.txt @@ -16,7 +16,6 @@ add_mlir_dialect_library(MLIRTensorDialect DEPENDS MLIRTensorOpsIncGen - MLIRTensorInterfacesIncGen LINK_LIBS PUBLIC MLIRAffineDialect diff --git a/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp b/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp index 002077753b132..8af087cbf0f61 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp @@ -63,7 +63,7 @@ void TensorDialect::initialize() { declarePromisedInterfaces(); declarePromisedInterface(); - declarePromisedInterfaces(); + declarePromisedInterfaces(); declarePromisedInterfaces(); } diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index 03c2f3843f262..fad7db48b9872 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -10,6 +10,7 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Complex/IR/Complex.h" +#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Utils/ReshapeOpsUtils.h" @@ -1156,20 +1157,6 @@ void EmptyOp::getCanonicalizationPatterns(RewritePatternSet &results, ReplaceEmptyTensorStaticShapeDims>(context); } -/// Try to remove a tensor operation if it would only reshape a constant. -/// Removes the op and replaces the constant with a new constant of the result -/// shape. When an optional cst attribute is passed, it is reshaped only if the -/// splat value matches the value in the attribute. 
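The scf loopUnrollFull helper added above dispatches on the constant trip count (zero, one, or N) and otherwise reuses loopUnrollByFactor. A caller-side sketch, assuming the declaration sits next to loopUnrollByFactor in the SCF Utils header, might look like the following; it only unrolls innermost loops so that unrolling never invalidates an already-collected loop handle.

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Utils/Utils.h"
#include "llvm/ADT/SmallVector.h"

// Fully unroll every innermost scf.for whose trip count is statically known.
static void unrollInnermostConstantLoops(mlir::func::FuncOp func) {
  llvm::SmallVector<mlir::scf::ForOp> innermost;
  func.walk([&](mlir::scf::ForOp forOp) {
    bool hasNestedFor = false;
    forOp.getBody()->walk([&](mlir::scf::ForOp) { hasNestedFor = true; });
    if (!hasNestedFor)
      innermost.push_back(forOp);
  });
  for (mlir::scf::ForOp forOp : innermost)
    (void)mlir::loopUnrollFull(forOp); // fails harmlessly on dynamic trip counts
}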
-static OpFoldResult -reshapeConstantSource(DenseElementsAttr source, TensorType result, - std::optional cst = std::nullopt) { - if (source && source.isSplat() && result.hasStaticShape() && - (!cst.has_value() || source.getSplatValue() == cst.value())) - return source.resizeSplat(result); - - return {}; -} - //===----------------------------------------------------------------------===// // ExtractOp //===----------------------------------------------------------------------===// @@ -3885,916 +3872,6 @@ OpFoldResult SplatOp::fold(FoldAdaptor adaptor) { return SplatElementsAttr::get(getType(), {constOperand}); } -//===----------------------------------------------------------------------===// -// PackOp/UnPackOp Common -//===----------------------------------------------------------------------===// - -template -static LogicalResult -reifyResultShapesImpl(OpTy op, OpBuilder &builder, - ReifiedRankedShapedTypeDims &reifiedReturnShapes) { - static_assert(llvm::is_one_of::value, - "applies to only pack or unpack operations"); - int64_t destRank = op.getDestRank(); - reifiedReturnShapes.resize(1, SmallVector(destRank)); - reifiedReturnShapes[0] = - tensor::getMixedSizes(builder, op.getLoc(), op.getDest()); - return success(); -} - -template -static DenseMap getDimAndTileMappingImpl(OpTy op) { - static_assert(llvm::is_one_of::value, - "applies to only pack or unpack operations"); - DenseMap dimAndTileMapping; - ArrayRef dimsToTile = op.getInnerDimsPos(); - SmallVector tiles = op.getMixedTiles(); - assert(tiles.size() == dimsToTile.size() && - "tiles must match indices of dimension to block"); - // bind the dimension `i` with the tile factor. - for (auto i : llvm::seq(0, dimsToTile.size())) - dimAndTileMapping[dimsToTile[i]] = tiles[i]; - return dimAndTileMapping; -} - -template -static SmallVector getMixedTilesImpl(OpTy op) { - static_assert(llvm::is_one_of::value, - "applies to only pack or unpack operations"); - Builder builder(op); - SmallVector mixedInnerTiles; - unsigned dynamicValIndex = 0; - for (int64_t staticTile : op.getStaticInnerTiles()) { - if (!ShapedType::isDynamic(staticTile)) - mixedInnerTiles.push_back(builder.getI64IntegerAttr(staticTile)); - else - mixedInnerTiles.push_back(op.getInnerTiles()[dynamicValIndex++]); - } - return mixedInnerTiles; -} - -template -static SmallVector getStaticTilesImpl(OpTy op) { - static_assert(llvm::is_one_of::value, - "applies to only pack or unpack operations"); - SmallVector dynamicTiles; - SmallVector staticTiles; - dispatchIndexOpFoldResults(op.getMixedTiles(), dynamicTiles, staticTiles); - return staticTiles; -} - -/// Returns true if `dimsPos` is invalid. It is invalid when: -/// a) It contains duplicate. -/// b) At least one dimension is out of bound (`dimPos` is >= 0 and < rank). -/// c) The number of elements in `dimsPos` is > than `rank`. -static bool isInvalidPackingPosSpecification(ArrayRef dimsPos, - size_t rank) { - size_t dimsPosSize = dimsPos.size(); - if (dimsPosSize > rank) - return true; - DenseSet uniqued; - for (int64_t dim : dimsPos) - uniqued.insert(dim); - if (dimsPosSize != uniqued.size()) - return true; - return llvm::any_of(dimsPos, [rank](int64_t dimPos) { - return dimPos < 0 || dimPos >= static_cast(rank); - }); -} - -/// Returns true if the dimension of `sourceShape` is smaller than the dimension -/// of the `limitShape`. 
-static bool areAllInBound(ArrayRef sourceShape, - ArrayRef limitShape) { - assert( - sourceShape.size() == limitShape.size() && - "expected source shape rank, and limit of the shape to have same rank"); - return llvm::all_of( - llvm::zip(sourceShape, limitShape), [](std::tuple it) { - int64_t sourceExtent = std::get<0>(it); - int64_t limit = std::get<1>(it); - return ShapedType::isDynamic(sourceExtent) || - ShapedType::isDynamic(limit) || sourceExtent <= limit; - }); -} - -template -static LogicalResult commonVerifierPackAndUnPackOp(OpTy packOrUnPack) { - static_assert(llvm::is_one_of::value, - "applies to only pack or unpack operations"); - Operation *op = packOrUnPack.getOperation(); - - // Return true if we have a zero-value tile. - auto hasZeros = [&](ArrayRef tiles) { - return llvm::any_of( - tiles, [](OpFoldResult tile) { return isConstantIntValue(tile, 0); }); - }; - - // Verify tiles. Do not allow zero tiles. - SmallVector mixedTiles = packOrUnPack.getMixedTiles(); - if (hasZeros(mixedTiles)) - return op->emitError("invalid zero tile factor"); - - // Verify inner_dims_pos and outer_dims_perm. - RankedTensorType unpackedType = (std::is_same::value) - ? packOrUnPack.getSourceType() - : packOrUnPack.getDestType(); - size_t unpackedRank = unpackedType.getRank(); - ArrayRef innerDimsPos = packOrUnPack.getInnerDimsPos(); - ArrayRef outerDimPerm = packOrUnPack.getOuterDimsPerm(); - if (isInvalidPackingPosSpecification(innerDimsPos, unpackedRank)) - return op->emitError("invalid inner_dims_pos vector"); - if (isInvalidPackingPosSpecification(outerDimPerm, unpackedRank)) - return op->emitError("invalid outer_dims_perm vector"); - if (!outerDimPerm.empty() && outerDimPerm.size() != unpackedRank) - return op->emitError("outer_dims_perm must be a permutation or empty"); - - // Tiling factors must be less than or equal to the input rank for pack (or - // output rank for unpack), and must match the number of `inner_dims_pos`. - if (mixedTiles.size() > unpackedRank) { - return op->emitError("tiling factors must be less than or equal to the " - "input rank for pack or output rank for unpack"); - } - if (mixedTiles.size() != innerDimsPos.size()) { - return op->emitError( - "tiling factors must equal the number of dimensions to tile"); - } - - ShapedType packedType = (std::is_same::value) - ? packOrUnPack.getDestType() - : packOrUnPack.getSourceType(); - size_t packedRank = packedType.getRank(); - // Require output rank to match input rank + number of blocking factors. - size_t expectedPackedRank = unpackedRank + mixedTiles.size(); - if (expectedPackedRank != packedRank) { - return op->emitError( - "packed rank != (unpacked rank + num tiling factors), got ") - << packedRank << " != " << expectedPackedRank; - } - - // Verify result shape is greater than the minimum expected - // by the pack operation, and that the output shape - // represents full tiles. - RankedTensorType expectedPackedType = PackOp::inferPackedType( - unpackedType, packOrUnPack.getStaticTiles(), innerDimsPos, outerDimPerm); - if (!areAllInBound(expectedPackedType.getShape(), packedType.getShape())) { - return op->emitError("the shape of output is not large enough to hold the " - "packed data. 
Expected at least ") - << expectedPackedType << ", got " << packedType; - } - if (!llvm::all_of( - llvm::zip(packedType.getShape().take_back(mixedTiles.size()), - mixedTiles), - [](std::tuple it) { - int64_t shape = std::get<0>(it); - if (Attribute attr = - llvm::dyn_cast_if_present(std::get<1>(it))) { - IntegerAttr intAttr = dyn_cast_or_null(attr); - int64_t staticTileSize = intAttr.getValue().getSExtValue(); - return shape == staticTileSize; - } - return ShapedType::isDynamic(shape); - })) { - return op->emitError("mismatch in inner tile sizes specified and shaped of " - "tiled dimension in the packed type"); - } - return success(); -} - -namespace { -/// Subset of PackOp/UnPackOp fields used to compute the result of applying -/// various permutations to the op. -// TODO: Add linalg.transpose + pack/unpack folding patterns that just reuse -// these. These may or may not become true foldings / canonicalizations -// depending on how aggressive we want to be in automatically folding -// transposes. -struct PackOrUnPackTransposeResult { - SmallVector innerDimsPos; - SmallVector innerTiles; - SmallVector outerDimsPerm; -}; -} // namespace - -template -static PackOrUnPackTransposeResult -commonPermutationOfPackAndUnPackOp(OpTy packOrUnPackOp, - ArrayRef innerPermutation, - ArrayRef outerPermutation) { - static_assert(llvm::is_one_of::value, - "applies to only pack or unpack operations"); - assert((!innerPermutation.empty() || !outerPermutation.empty()) && - "some permutation must be non-empty"); - PackOrUnPackTransposeResult metadata; - metadata.innerDimsPos = - SmallVector(packOrUnPackOp.getInnerDimsPos()); - metadata.innerTiles = - SmallVector(packOrUnPackOp.getMixedTiles()); - int64_t numOuterDims = std::is_same::value - ? packOrUnPackOp.getSourceRank() - : packOrUnPackOp.getDestRank(); - metadata.outerDimsPerm = - packOrUnPackOp.getOuterDimsPerm().empty() - ? llvm::to_vector(llvm::seq(0, numOuterDims)) - : SmallVector(packOrUnPackOp.getOuterDimsPerm()); - if (!innerPermutation.empty()) { - assert(innerPermutation.size() == metadata.innerDimsPos.size() && - isPermutationVector(innerPermutation) && - "invalid inner permutation"); - applyPermutationToVector(metadata.innerDimsPos, innerPermutation); - applyPermutationToVector(metadata.innerTiles, innerPermutation); - } - if (!outerPermutation.empty()) { - assert(outerPermutation.size() == metadata.outerDimsPerm.size() && - isPermutationVector(outerPermutation) && - "invalid outer permutation"); - applyPermutationToVector(metadata.outerDimsPerm, outerPermutation); - } - return metadata; -} - -//===----------------------------------------------------------------------===// -// PackOp -//===----------------------------------------------------------------------===// - -void PackOp::getAsmResultNames(function_ref setNameFn) { - setNameFn(getResult(), "pack"); -} - -void PackOp::build(OpBuilder &builder, OperationState &state, Value source, - Value dest, ArrayRef innerDimsPos, - ArrayRef innerTiles, - std::optional paddingValue, - ArrayRef outerDimsPerm) { - assert(innerDimsPos.size() == innerTiles.size() && - "number of tile sizes specified must match the specified number of " - "original dimensions to be tiled"); - SmallVector staticTileSizes; - SmallVector dynamicTileSizes; - dispatchIndexOpFoldResults(innerTiles, dynamicTileSizes, staticTileSizes); - build(builder, state, dest.getType(), source, dest, - paddingValue ? *paddingValue : nullptr, - outerDimsPerm.empty() ? 
nullptr - : builder.getDenseI64ArrayAttr(outerDimsPerm), - builder.getDenseI64ArrayAttr(innerDimsPos), dynamicTileSizes, - builder.getDenseI64ArrayAttr(staticTileSizes)); -} - -LogicalResult -PackOp::reifyResultShapes(OpBuilder &builder, - ReifiedRankedShapedTypeDims &reifiedReturnShapes) { - return reifyResultShapesImpl(*this, builder, reifiedReturnShapes); -} - -DenseMap PackOp::getDimAndTileMapping() { - return getDimAndTileMappingImpl(*this); -} - -SmallVector PackOp::getMixedTiles() { - return getMixedTilesImpl(*this); -} - -SmallVector PackOp::getStaticTiles() { - return getStaticTilesImpl(*this); -} - -ArrayRef PackOp::getAllOuterDims() { - ShapedType inputType = getSourceType(); - int64_t inputRank = inputType.getRank(); - return getDestType().getShape().take_front(inputRank); -} - -SmallVector PackOp::getTiledOuterDims() { - auto innerDimsPos = getInnerDimsPos(); - auto packedShape = getDestType().getShape(); - SmallVector res; - - for (auto index : innerDimsPos) - res.push_back(packedShape[index]); - - return res; -} - -bool PackOp::requirePaddingValue(ArrayRef inputShape, - ArrayRef innerDimsPos, - ArrayRef outputShape, - ArrayRef outerDimsPerm, - ArrayRef innerTiles) { - SmallVector outputTileSizes( - outputShape.take_front(inputShape.size())); - if (!outerDimsPerm.empty()) { - assert(outerDimsPerm.size() == outputTileSizes.size() && - "expected output and outer_dims_perm to have same size"); - applyPermutationToVector(outputTileSizes, - invertPermutationVector(outerDimsPerm)); - } - for (auto [pos, tileSize] : llvm::zip_equal(innerDimsPos, innerTiles)) { - if (ShapedType::isDynamic(inputShape[pos])) - continue; - std::optional constantTile = getConstantIntValue(tileSize); - - if (!constantTile) { - if (!ShapedType::isDynamic(outputTileSizes[pos]) && - (inputShape[pos] % outputTileSizes[pos] != 0)) - return true; - } else if (inputShape[pos] % (*constantTile) != 0) { - return true; - } - } - return false; -} - -LogicalResult PackOp::verify() { - if (failed(commonVerifierPackAndUnPackOp(*this))) - return failure(); - - // Verify padding value, and bail out if the tile does not divide the - // dimension fully. In the case of dynamic tile factors or dimensions, having - // a partial tile is undefined behavior. - auto paddingValue = getPaddingValue(); - if (paddingValue && - paddingValue.getType() != getSourceType().getElementType()) { - return emitOpError("expected padding_value has ") - << getSourceType().getElementType() - << " but got: " << paddingValue.getType(); - } - - if (!paddingValue && - requirePaddingValue(getSourceType().getShape(), getInnerDimsPos(), - getDestType().getShape(), getOuterDimsPerm(), - getMixedTiles())) { - return emitOpError( - "invalid tile factor or output size provided. Only full tiles are " - "supported when padding_value is not set"); - } - return success(); -} - -/// Converts OpFoldResults to int64_t shape entries, unconditionally mapping all -/// Value's to kDynamic, even if they are arith.constant values. -static SmallVector -asShapeWithAnyValueAsDynamic(ArrayRef ofrs) { - SmallVector result; - for (auto o : ofrs) { - // Have to do this first, as getConstantIntValue special-cases constants. - if (llvm::dyn_cast_if_present(o)) - result.push_back(ShapedType::kDynamic); - else - result.push_back(getConstantIntValue(o).value_or(ShapedType::kDynamic)); - } - return result; -} - -/// Helper for PackOp::{getResultShape,inferPackedType}. Returns the shape of -/// the packed type. 
Having a shared helper helps implement these two methods in -/// a way that ensures that they agree on which dimensions are dynamic. -static SmallVector getPackOpResultTypeShape( - ArrayRef sourceShape, ArrayRef innerTileSizes, - ArrayRef innerDimsPos, ArrayRef outerDimsPerm) { - SmallVector resultShape = llvm::to_vector(sourceShape); - for (auto tiledDim : llvm::enumerate(llvm::to_vector(innerDimsPos))) { - if (ShapedType::isDynamic(resultShape[tiledDim.value()])) - continue; - if (ShapedType::isDynamic(innerTileSizes[tiledDim.index()])) { - resultShape[tiledDim.value()] = ShapedType::kDynamic; - continue; - } - resultShape[tiledDim.value()] = divideCeilSigned( - resultShape[tiledDim.value()], innerTileSizes[tiledDim.index()]); - } - - // Swap tile loops if outer_dims_perm is available. - if (!outerDimsPerm.empty()) - applyPermutationToVector(resultShape, outerDimsPerm); - - // Append the inner tile dimensions. - resultShape.append(innerTileSizes.begin(), innerTileSizes.end()); - return resultShape; -} - -SmallVector PackOp::getResultShape( - OpBuilder &builder, Location loc, ArrayRef sourceDims, - ArrayRef innerTileSizes, ArrayRef innerDimsPos, - ArrayRef outerDimsPerm) { - SmallVector resultDims = llvm::to_vector(sourceDims); - - AffineExpr s0, s1; - bindSymbols(builder.getContext(), s0, s1); - AffineExpr ceilDivExpr = s0.ceilDiv(s1); - for (auto tiledDim : llvm::enumerate(llvm::to_vector(innerDimsPos))) { - resultDims[tiledDim.value()] = affine::makeComposedFoldedAffineApply( - builder, loc, ceilDivExpr, - {resultDims[tiledDim.value()], innerTileSizes[tiledDim.index()]}); - } - if (!outerDimsPerm.empty()) - applyPermutationToVector(resultDims, outerDimsPerm); - resultDims.append(innerTileSizes.begin(), innerTileSizes.end()); - - SmallVector resultTypeShape = - getPackOpResultTypeShape(asShapeWithAnyValueAsDynamic(sourceDims), - asShapeWithAnyValueAsDynamic(innerTileSizes), - innerDimsPos, outerDimsPerm); - - // Fix-up `resultDims` to ensure that they are Value's if and only if the - // result type shape says it's a dynamic dim. This is needed as callers may - // use dispatchIndexOpFoldResults on the result, and rely on exact number of - // dynamic dims returned by that. - for (unsigned i = 0; i < resultDims.size(); ++i) { - if (!ShapedType::isDynamic(resultTypeShape[i])) - continue; - resultDims[i] = - getValueOrCreateConstantIndexOp(builder, loc, resultDims[i]); - } - - return resultDims; -} - -/// Get the expected packed type based on source type, tile factors, position of -/// the inner tiles and permutation of the outer tiled loop. 
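Concretely, the shared helper above composes three steps: ceil-divide the tiled source dims, interchange the outer dims, and append the inner tiles. The sketch below reproduces that arithmetic on assumed static values (dynamic dims, which the real helper maps to kDynamic, are ignored here): a tensor<16x16xf32> with inner_dims_pos = [0, 1], inner_tiles = [8, 2] and outer_dims_perm = [1, 0] packs into tensor<8x2x8x2xf32>.

#include <cassert>
#include <cstdint>
#include <vector>

// Mirror of getPackOpResultTypeShape for fully static shapes (illustration only).
static std::vector<int64_t>
inferPackedShape(std::vector<int64_t> shape, const std::vector<int64_t> &tiles,
                 const std::vector<int64_t> &innerDimsPos,
                 const std::vector<int64_t> &outerDimsPerm) {
  // 1. Ceil-divide each tiled dimension by its tile size.
  for (size_t i = 0; i < innerDimsPos.size(); ++i)
    shape[innerDimsPos[i]] = (shape[innerDimsPos[i]] + tiles[i] - 1) / tiles[i];
  // 2. Interchange the outer dims: result[i] = shape[outerDimsPerm[i]].
  std::vector<int64_t> result = shape;
  for (size_t i = 0; i < outerDimsPerm.size(); ++i)
    result[i] = shape[outerDimsPerm[i]];
  // 3. Append the inner tile sizes.
  result.insert(result.end(), tiles.begin(), tiles.end());
  return result;
}

int main() {
  std::vector<int64_t> packed = inferPackedShape({16, 16}, {8, 2}, {0, 1}, {1, 0});
  assert((packed == std::vector<int64_t>{8, 2, 8, 2}));
  return 0;
}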
-RankedTensorType PackOp::inferPackedType(RankedTensorType sourceType, - ArrayRef innerTileSizes, - ArrayRef innerDimsPos, - ArrayRef outerDimsPerm) { - SmallVector resultShape = getPackOpResultTypeShape( - sourceType.getShape(), innerTileSizes, innerDimsPos, outerDimsPerm); - return RankedTensorType::get(resultShape, sourceType.getElementType()); -} - -Value PackOp::createDestinationTensor(OpBuilder &b, Location loc, Value source, - ArrayRef innerTileSizes, - ArrayRef innerDimsPos, - ArrayRef outerDimsPerm) { - AffineExpr dim0, dim1; - bindDims(b.getContext(), dim0, dim1); - auto ceilDiv = [&](OpFoldResult v1, OpFoldResult v2) -> OpFoldResult { - return affine::makeComposedFoldedAffineApply(b, loc, dim0.ceilDiv(dim1), - {v1, v2}); - }; - - SmallVector mixedSizes; - for (auto [index, value] : llvm::enumerate( - llvm::cast(source.getType()).getShape())) { - if (ShapedType::isDynamic(value)) - mixedSizes.push_back(b.create(loc, source, index).getResult()); - else - mixedSizes.push_back(b.getIndexAttr(value)); - } - for (auto it : llvm::zip(innerDimsPos, innerTileSizes)) { - int64_t dimPos = std::get<0>(it); - OpFoldResult tileSize = std::get<1>(it); - mixedSizes[dimPos] = ceilDiv(mixedSizes[dimPos], tileSize); - } - if (!outerDimsPerm.empty()) - applyPermutationToVector(mixedSizes, outerDimsPerm); - - mixedSizes.append(innerTileSizes.begin(), innerTileSizes.end()); - auto elemType = llvm::cast(source.getType()).getElementType(); - return b.create(loc, mixedSizes, elemType); -} - -PackOp PackOp::createTransposedClone(OpBuilder &b, Location loc, - ArrayRef innerPermutation, - ArrayRef outerPermutation) { - PackOrUnPackTransposeResult metadata = commonPermutationOfPackAndUnPackOp( - *this, innerPermutation, outerPermutation); - Value transposedDest = - createDestinationTensor(b, loc, getSource(), metadata.innerTiles, - metadata.innerDimsPos, metadata.outerDimsPerm); - return b.create(loc, getSource(), transposedDest, - metadata.innerDimsPos, metadata.innerTiles, - getPaddingValue(), metadata.outerDimsPerm); -} - -/// Returns true if the tiles and the tiled dims are constant. -template -bool areTilesAndTiledDimsAllConstant(OpTy op) { - static_assert(llvm::is_one_of::value, - "applies to only pack or unpack operations"); - ShapedType packedType = (std::is_same::value) - ? op.getDestType() - : op.getSourceType(); - SmallVector mixedTiles = op.getMixedTiles(); - for (auto [dimDest, tile] : llvm::zip( - packedType.getShape().take_back(mixedTiles.size()), mixedTiles)) { - std::optional constTileSize = getConstantIntValue(tile); - if (!constTileSize || ShapedType::isDynamic(dimDest)) - return false; - } - return true; -} - -Speculation::Speculatability PackOp::getSpeculatability() { - if (getPaddingValue()) - return Speculation::Speculatable; - - // The verifier rejects already operations if we can statically prove that the - // sizes of the tiles do not divide perfectly the dimension; thus, check only - // to have constant tiles and tiled inner dimensions. - if (!areTilesAndTiledDimsAllConstant(*this)) - return Speculation::NotSpeculatable; - - return Speculation::Speculatable; -} - -// Return true if `inner_dims_pos` and `outer_dims_perm` target the same -// dimensions for pack and unpack. -static bool hasSameInnerOuterAttribute(PackOp packOp, UnPackOp unPackOp) { - if (packOp.getInnerDimsPos() != unPackOp.getInnerDimsPos()) - return false; - if (packOp.getOuterDimsPerm() == unPackOp.getOuterDimsPerm()) - return true; - // Outer dims permutation is optional. 
- // To compare unbalanced pack-unpack pair, treat no permutation as equal to - // identity permutation. - return isIdentityPermutation(packOp.getOuterDimsPerm()) && - isIdentityPermutation(unPackOp.getOuterDimsPerm()); -} - -// Return true if pack and unpack have the same tiles. -// Same SSA values or same integer constants. -static bool haveSameTiles(PackOp packOp, UnPackOp unPackOp) { - auto packTiles = packOp.getMixedTiles(); - auto unPackTiles = unPackOp.getMixedTiles(); - if (packTiles.size() != unPackTiles.size()) - return false; - for (size_t i = 0, e = packTiles.size(); i < e; i++) { - if (!isEqualConstantIntOrValue(packTiles[i], unPackTiles[i])) - return false; - } - return true; -} - -/// Returns true if the pack op does not need a padding value. -static bool paddingIsNotNeeded(PackOp op) { - auto srcType = op.getSourceType(); - if (llvm::any_of(op.getInnerDimsPos(), - [&](int64_t pos) { return srcType.isDynamicDim(pos); })) - return false; - if (ShapedType::isDynamicShape(op.getStaticInnerTiles())) - return false; - return !PackOp::requirePaddingValue( - srcType.getShape(), op.getInnerDimsPos(), op.getDestType().getShape(), - op.getOuterDimsPerm(), op.getMixedTiles()); -} - -/// Returns true if the `srcShape` or `destShape` is different from the one in -/// `packOp` and populates each with the inferred static shape. -static bool inferStaticShape(PackOp packOp, SmallVectorImpl &srcShape, - SmallVectorImpl &destShape) { - bool changeNeeded = false; - srcShape.assign(packOp.getSourceType().getShape().begin(), - packOp.getSourceType().getShape().end()); - destShape.assign(packOp.getDestType().getShape().begin(), - packOp.getDestType().getShape().end()); - llvm::SmallSetVector innerDims; - innerDims.insert(packOp.getInnerDimsPos().begin(), - packOp.getInnerDimsPos().end()); - SmallVector inverseOuterDimsPerm; - if (!packOp.getOuterDimsPerm().empty()) - inverseOuterDimsPerm = invertPermutationVector(packOp.getOuterDimsPerm()); - int srcRank = packOp.getSourceRank(); - for (auto i : llvm::seq(0, srcRank)) { - if (innerDims.contains(i)) - continue; - int64_t srcPos = i; - int64_t destPos = i; - if (!inverseOuterDimsPerm.empty()) - destPos = inverseOuterDimsPerm[srcPos]; - if (ShapedType::isDynamic(srcShape[srcPos]) == - ShapedType::isDynamic(destShape[destPos])) { - continue; - } - int64_t size = srcShape[srcPos]; - if (ShapedType::isDynamic(size)) - size = destShape[destPos]; - srcShape[srcPos] = size; - destShape[destPos] = size; - changeNeeded = true; - } - return changeNeeded; -} - -LogicalResult PackOp::canonicalize(PackOp packOp, PatternRewriter &rewriter) { - // Fold an pack(unpack(x)) to x. - if (auto unPackOp = packOp.getSource().getDefiningOp()) { - if (unPackOp.getSourceType() != packOp.getDestType()) - return failure(); - if (packOp.getPaddingValue() || - !hasSameInnerOuterAttribute(packOp, unPackOp) || - !haveSameTiles(packOp, unPackOp)) - return failure(); - rewriter.replaceOp(packOp, unPackOp.getSource()); - return success(); - } - - // Fold optional PaddingValue operand away if padding is not needed. - if (packOp.getPaddingValue() && paddingIsNotNeeded(packOp)) { - rewriter.startOpModification(packOp); - packOp.getPaddingValueMutable().clear(); - rewriter.finalizeOpModification(packOp); - return success(); - } - - // Insert tensor.cast ops if static shape inference is available.. 
- SmallVector srcShape, destShape; - if (inferStaticShape(packOp, srcShape, destShape)) { - Location loc = packOp.getLoc(); - Value source = packOp.getSource(); - if (srcShape != packOp.getSourceType().getShape()) { - auto newSrcType = packOp.getSourceType().clone(srcShape); - source = - rewriter.create(loc, newSrcType, packOp.getSource()); - } - Value dest = packOp.getDest(); - RankedTensorType originalResultType = packOp.getDestType(); - bool needUpdateDestType = (destShape != originalResultType.getShape()); - if (needUpdateDestType) { - auto newDestType = packOp.getDestType().clone(destShape); - dest = - rewriter.create(loc, newDestType, packOp.getDest()); - } - rewriter.modifyOpInPlace(packOp, [&] { - packOp.getSourceMutable().assign(source); - packOp.getDestMutable().assign(dest); - packOp.getResult().setType(cast(dest.getType())); - }); - // Insert a cast if needed - if (needUpdateDestType) { - rewriter.setInsertionPointAfter(packOp); - auto castOp = - rewriter.create(loc, originalResultType, packOp); - rewriter.replaceAllUsesExcept(packOp, castOp, castOp); - } - return success(); - } - - return failure(); -} - -template -static bool isLikePadUnPad(PackOrUnpackOp packOp, - RankedTensorType packedTensorType) { - static_assert(std::is_same::value || - std::is_same::value, - "Function meant for pack/unpack"); - // This is a pad if packing only adds ones and we don't transpose dimensions. - - // Check that we are not transposing any dimensions. - ArrayRef innerDimsPos = packOp.getInnerDimsPos(); - int64_t numPackedDims = innerDimsPos.size(); - auto orderedDims = llvm::to_vector<4>(llvm::seq(0, numPackedDims)); - if (orderedDims != innerDimsPos) { - // Dimensions don't happen in order. - return false; - } - - ArrayRef packedShape = packedTensorType.getShape(); - int64_t packedRank = packedTensorType.getRank(); - // At this point we know that we are taking numPackedDims outer - // dimensions and pushing them all the way as the inner most dimensions. - // What's left on the outer most dimensions is, in this order: - // - the factor of the packed dimensions, then - // - the untouched dimensions - // This shifting inward of dimensions is a no-op (as opposed to a transpose) - // if all the dimensions that bubble outerward are ones. - // Therefore check that all the dimensions but the numPackedDims inner most - // ones are ones. 
- return llvm::all_of( - llvm::seq(0, packedRank - numPackedDims), - [&packedShape](int64_t i) { return packedShape[i] == 1; }); -} - -bool PackOp::isLikePad() { - auto packedTensorType = - llvm::cast((*this)->getResultTypes().front()); - return isLikePadUnPad(*this, packedTensorType); -} - -OpFoldResult PackOp::fold(FoldAdaptor adaptor) { - std::optional paddingValue; - if (auto pad = adaptor.getPaddingValue()) - paddingValue = pad; - if (OpFoldResult reshapedSource = reshapeConstantSource( - llvm::dyn_cast_if_present(adaptor.getSource()), - getDestType(), paddingValue)) - return reshapedSource; - return {}; -} - -//===----------------------------------------------------------------------===// -// UnPackOp -//===----------------------------------------------------------------------===// - -void UnPackOp::getAsmResultNames( - function_ref setNameFn) { - setNameFn(getResult(), "unpack"); -} - -LogicalResult -UnPackOp::reifyResultShapes(OpBuilder &builder, - ReifiedRankedShapedTypeDims &reifiedReturnShapes) { - return reifyResultShapesImpl(*this, builder, reifiedReturnShapes); -} - -DenseMap UnPackOp::getDimAndTileMapping() { - return getDimAndTileMappingImpl(*this); -} - -SmallVector UnPackOp::getMixedTiles() { - return getMixedTilesImpl(*this); -} - -SmallVector UnPackOp::getStaticTiles() { - return getStaticTilesImpl(*this); -} - -ArrayRef UnPackOp::getAllOuterDims() { - ShapedType destType = getDestType(); - int64_t destRank = destType.getRank(); - return getSourceType().getShape().take_front(destRank); -} - -SmallVector UnPackOp::getTiledOuterDims() { - auto innerDimsPos = getInnerDimsPos(); - auto packedShape = getSourceType().getShape(); - SmallVector res; - - for (auto index : innerDimsPos) - res.push_back(packedShape[index]); - - return res; -} - -LogicalResult UnPackOp::verify() { - return commonVerifierPackAndUnPackOp(*this); -} - -Speculation::Speculatability UnPackOp::getSpeculatability() { - // See PackOp::getSpeculatability. - if (!areTilesAndTiledDimsAllConstant(*this)) - return Speculation::NotSpeculatable; - - return Speculation::Speculatable; -} - -void UnPackOp::build(OpBuilder &builder, OperationState &state, Value source, - Value dest, ArrayRef innerDimsPos, - ArrayRef innerTiles, - ArrayRef outerDimsPerm) { - assert(innerDimsPos.size() == innerTiles.size() && - "number of tile sizes specified must match the specified number of " - "original dimensions to be tiled"); - SmallVector staticTileSizes; - SmallVector dynamicTileSizes; - dispatchIndexOpFoldResults(innerTiles, dynamicTileSizes, staticTileSizes); - build(builder, state, dest.getType(), source, dest, - outerDimsPerm.empty() ? 
nullptr - : builder.getDenseI64ArrayAttr(outerDimsPerm), - builder.getDenseI64ArrayAttr(innerDimsPos), dynamicTileSizes, - builder.getDenseI64ArrayAttr(staticTileSizes)); -} - -Value UnPackOp::createDestinationTensor(OpBuilder &b, Location loc, - Value source, - ArrayRef innerTileSizes, - ArrayRef innerDimsPos, - ArrayRef outerDimsPerm) { - AffineExpr sym0, sym1; - bindSymbols(b.getContext(), sym0, sym1); - auto dimMul = [&](OpFoldResult v1, OpFoldResult v2) -> OpFoldResult { - return affine::makeComposedFoldedAffineApply(b, loc, sym0 * sym1, {v1, v2}); - }; - - SmallVector mixedSizes; - auto srcType = llvm::cast(source.getType()); - for (auto i : - llvm::seq(0, srcType.getRank() - innerTileSizes.size())) { - if (srcType.isDynamicDim(i)) - mixedSizes.push_back(b.create(loc, source, i).getResult()); - else - mixedSizes.push_back(b.getIndexAttr(srcType.getDimSize(i))); - } - if (!outerDimsPerm.empty()) { - applyPermutationToVector( - mixedSizes, invertPermutationVector(outerDimsPerm)); - } - - for (auto [dimPos, tileSize] : llvm::zip_equal(innerDimsPos, innerTileSizes)) - mixedSizes[dimPos] = dimMul(mixedSizes[dimPos], tileSize); - - auto elemType = srcType.getElementType(); - return b.create(loc, mixedSizes, elemType); -} - -UnPackOp UnPackOp::createTransposedClone(OpBuilder &b, Location loc, - Value transposedSource, - ArrayRef innerPermutation, - ArrayRef outerPermutation) { - PackOrUnPackTransposeResult metadata = commonPermutationOfPackAndUnPackOp( - *this, innerPermutation, outerPermutation); - return b.create(loc, transposedSource, getDest(), - metadata.innerDimsPos, metadata.innerTiles, - metadata.outerDimsPerm); -} - -/// Returns true if the `srcShape` or `destShape` is different from the one in -/// `op` and populates each with the inferred static shape. 
-static bool inferStaticShape(UnPackOp op, SmallVectorImpl &srcShape, - SmallVectorImpl &destShape) { - bool changeNeeded = false; - srcShape.assign(op.getSourceType().getShape().begin(), - op.getSourceType().getShape().end()); - destShape.assign(op.getDestType().getShape().begin(), - op.getDestType().getShape().end()); - llvm::SmallSetVector innerDims; - innerDims.insert(op.getInnerDimsPos().begin(), op.getInnerDimsPos().end()); - SmallVector inverseOuterDimsPerm; - if (!op.getOuterDimsPerm().empty()) - inverseOuterDimsPerm = invertPermutationVector(op.getOuterDimsPerm()); - int destRank = op.getDestRank(); - for (auto i : llvm::seq(0, destRank)) { - if (innerDims.contains(i)) - continue; - int64_t srcPos = i; - int64_t destPos = i; - if (!inverseOuterDimsPerm.empty()) - srcPos = inverseOuterDimsPerm[destPos]; - if (ShapedType::isDynamic(srcShape[srcPos]) == - ShapedType::isDynamic(destShape[destPos])) { - continue; - } - int64_t size = srcShape[srcPos]; - if (ShapedType::isDynamic(size)) - size = destShape[destPos]; - srcShape[srcPos] = size; - destShape[destPos] = size; - changeNeeded = true; - } - return changeNeeded; -} - -LogicalResult UnPackOp::canonicalize(UnPackOp unPackOp, - PatternRewriter &rewriter) { - /// unpack(pack(x)) -> x - if (PackOp packOp = unPackOp.getSource().getDefiningOp()) { - if (packOp.getSourceType() != unPackOp.getDestType()) - return failure(); - if (packOp.getPaddingValue() || - !hasSameInnerOuterAttribute(packOp, unPackOp) || - !haveSameTiles(packOp, unPackOp)) - return failure(); - rewriter.replaceOp(unPackOp, packOp.getSource()); - return success(); - } - /// unpack(destinationStyleOp(x)) -> unpack(x) - if (auto dstStyleOp = - unPackOp.getDest().getDefiningOp()) { - auto destValue = cast(unPackOp.getDest()); - Value newDest = dstStyleOp.getDpsInits()[destValue.getResultNumber()]; - rewriter.modifyOpInPlace(unPackOp, - [&]() { unPackOp.setDpsInitOperand(0, newDest); }); - return success(); - } - - // Insert tensor.cast ops if static shape inference is available.. - SmallVector srcShape, destShape; - if (inferStaticShape(unPackOp, srcShape, destShape)) { - Location loc = unPackOp.getLoc(); - Value source = unPackOp.getSource(); - if (srcShape != unPackOp.getSourceType().getShape()) { - auto newSrcType = unPackOp.getSourceType().clone(srcShape); - source = rewriter.create(loc, newSrcType, - unPackOp.getSource()); - } - Value dest = unPackOp.getDest(); - if (destShape != unPackOp.getDestType().getShape()) { - auto newDestType = unPackOp.getDestType().clone(destShape); - dest = - rewriter.create(loc, newDestType, unPackOp.getDest()); - } - Value newOp = rewriter.create( - loc, source, dest, unPackOp.getInnerDimsPos(), unPackOp.getMixedTiles(), - unPackOp.getOuterDimsPerm()); - rewriter.replaceOpWithNewOp( - unPackOp, unPackOp.getResult().getType(), newOp); - return success(); - } - - return failure(); -} - -bool UnPackOp::isLikeUnPad() { - RankedTensorType packedTensorType = getSourceType(); - return isLikePadUnPad(*this, packedTensorType); -} - -OpFoldResult UnPackOp::fold(FoldAdaptor adaptor) { - if (OpFoldResult reshapedSource = reshapeConstantSource( - llvm::dyn_cast_if_present(adaptor.getSource()), - getResult().getType())) - return reshapedSource; - return {}; -} - //===----------------------------------------------------------------------===// // Common Canonicalizers and Folders. 
//===----------------------------------------------------------------------===// @@ -4809,151 +3886,6 @@ bool foldTensorCastPrecondition(DestinationStyleOpInterface op) { return hasFoldableTensorCastOperand(op); } -// Given the (potentially) updated packed type, `newPackedTy`, generates an -// updated mixed-tile-sizes attribute. A tile size is updated only -// when: -// * a dim from newPackedTy is static, and -// * the corresponding size from mixedTiles is still dynamic. -// Otherwise, the original tile size is preserved. -// Note - packed-type-dim and mixed-tile-size should always match! -static SmallVector -getNewMixedTileSizes(PatternRewriter &rewriter, Type newPackedTy, - SmallVector mixedTiles) { - SmallVector newMixedTileSizes; - for (auto it : llvm::zip(cast(newPackedTy) - .getShape() - .take_back(mixedTiles.size()), - mixedTiles)) { - int64_t shape = std::get<0>(it); - if (shape == ShapedType::kDynamic) { - newMixedTileSizes.push_back(std::get<1>(it)); - continue; - } - - // If the current result dim is static, update the dynamic mixed-size - // (provided the original value is dynamic). - OpFoldResult tile = std::get<1>(it); - if (Attribute attr = llvm::dyn_cast_if_present(tile)) { - // Already a constant - newMixedTileSizes.push_back(tile); - } else { - assert(getConstantIntValue(tile).value() == shape && - "tile size and dim size don't match!"); - newMixedTileSizes.push_back( - (rewriter.getIntegerAttr(rewriter.getIndexType(), shape))); - } - } - - return newMixedTileSizes; -} - -/// Folds a tensor.cast op into a consuming tensor::PackOp op if the -/// `tensor.cast` has source that is more static than the consuming op. -/// -/// Example: -/// ```mlir -/// %1 = tensor.cast %0 : tensor<8x16xf32> to tensor -/// %2 = tensor.pack %1 ... : tensor ... -/// ``` -/// -/// folds into: -/// -/// ```mlir -/// %2 = tensor.pack %0 ... : tensor<8x16xf32> ... -/// ``` -struct FoldTensorCastPackOp : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(PackOp op, - PatternRewriter &rewriter) const override { - if (!foldTensorCastPrecondition(op)) - return failure(); - - SmallVector newResultTypes(op->getResultTypes()); - SmallVector newOperands = - getUpdatedOperandsAfterCastOpFolding(op, newResultTypes); - - // Get the updated mixed-tile-sizes attribute. - SmallVector newMixedTileSizes = - getNewMixedTileSizes(rewriter, newResultTypes[0], op.getMixedTiles()); - - // Clone op. - // TODO: Strictly speaking, discardable attributes should be _discarded_ at - // this point. However, in practice, we use them for things that we'd like - // to preserve. Implement a better abstraction. - PackOp newOp = rewriter.create( - op.getLoc(), newOperands[0], newOperands[1], op.getInnerDimsPos(), - newMixedTileSizes, op.getPaddingValue(), op.getOuterDimsPerm()); - newOp->setDiscardableAttrs(op->getDiscardableAttrDictionary()); - - // Replace op. - Value oldResult = op.getResult(); - Value newResult = newOp.getResult(); - Value replacement = (newResult.getType() != oldResult.getType()) - ? rewriter.create( - op->getLoc(), oldResult.getType(), newResult) - : newResult; - - rewriter.replaceOp(op, {replacement}); - - return success(); - } -}; - -/// Folds a tensor.cast op into a consuming tensor::UnPackOp op if the -/// `tensor.cast` has source that is more static than the consuming op. -/// -/// Example: -/// ```mlir -/// %1 = tensor.cast %0 : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32> -/// %2 = tensor.unpack %1 ... 
: tensor<1x1x?x1xi32> -> tensor<7x?xi32> -/// ``` -/// -/// folds into: -/// -/// ```mlir -/// %2 = tensor.unpack %0 ... tensor<1x1x8x1xi32> -> tensor<7x?xi32> -/// ``` -struct FoldTensorCastUnPackOp : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(UnPackOp op, - PatternRewriter &rewriter) const override { - if (!foldTensorCastPrecondition(op)) - return failure(); - - SmallVector newResultTypes(op->getResultTypes()); - SmallVector newOperands = - getUpdatedOperandsAfterCastOpFolding(op, newResultTypes); - Value sourceTensor = newOperands[0]; - - // Get the updated mixed-tile-sizes attribute. - SmallVector newMixedTileSizes = getNewMixedTileSizes( - rewriter, sourceTensor.getType(), op.getMixedTiles()); - - // Clone op. - // TODO: Strictly speaking, discardable attributes should be _discarded_ at - // this point. However, in practice, we use them for things that we'd like - // to preserve. Implement a better abstraction. - UnPackOp newOp = rewriter.create( - op.getLoc(), sourceTensor, newOperands[1], op.getInnerDimsPos(), - newMixedTileSizes, op.getOuterDimsPerm()); - newOp->setDiscardableAttrs(op->getDiscardableAttrDictionary()); - - // Replace op. - Value oldResult = op.getResult(); - Value newResult = newOp.getResult(); - Value replacement = (newResult.getType() != oldResult.getType()) - ? rewriter.create( - op->getLoc(), oldResult.getType(), newResult) - : newResult; - - rewriter.replaceOp(op, {replacement}); - - return success(); - } -}; - /// Folds a tensor.cast op into a consuming DestinationStyleOpInterface op if /// the `tensor.cast` has source that is more static than the consuming op. /// @@ -4978,9 +3910,10 @@ struct FoldTensorCastProducerOp LogicalResult matchAndRewrite(DestinationStyleOpInterface op, PatternRewriter &rewriter) const override { - // Reject tensor::PackOp - there's dedicated pattern for that instead. + // Reject PackOp/UnpackOp (i.e. RelayoutOps) - there are dedicated patterns + // for that instead. if (!foldTensorCastPrecondition(op) || - isa(*op)) + isa(*op)) return failure(); SmallVector newResultTypes(op->getResultTypes()); @@ -5013,8 +3946,6 @@ struct FoldTensorCastProducerOp void TensorDialect::getCanonicalizationPatterns( RewritePatternSet &results) const { - results.add(getContext()); - results.add(getContext()); results.add(getContext()); } diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp index 60dda39061085..7295f8771a1d5 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp @@ -70,648 +70,6 @@ struct PadOpTiling : public TilingInterface::ExternalModel { } }; -template -static SmallVector getPackUnPackIterationDomain(OpTy op, - OpBuilder &builder) { - static_assert(llvm::is_one_of::value, - "applies to only pack or unpack operations"); - OpBuilder::InsertionGuard g(builder); - int64_t rank = (std::is_same::value) ? 
op.getSourceRank() - : op.getDestRank(); - OpFoldResult zero = builder.getIndexAttr(0); - OpFoldResult one = builder.getIndexAttr(1); - ReifiedRankedShapedTypeDims resultShape; - (void)reifyResultShapes(builder, op, resultShape); - SmallVector loopBounds(rank); - for (auto dim : llvm::seq(0, rank)) { - loopBounds[dim].offset = zero; - loopBounds[dim].stride = one; - loopBounds[dim].size = resultShape[0][dim]; - } - return loopBounds; -} - -static void applyPermToRange(SmallVector &offsets, - SmallVector &sizes, - ArrayRef permutation) { - if (permutation.empty()) - return; - applyPermutationToVector(offsets, permutation); - applyPermutationToVector(sizes, permutation); -} - -struct PackOpTiling - : public TilingInterface::ExternalModel { - - SmallVector getLoopIteratorTypes(Operation *op) const { - // Note that here we only consider untiled dimensions and outer tiled data - // dimensions, the inner tiled data dimensions are materialized when - // building the body of the operation. - auto packOp = cast(op); - SmallVector iteratorTypes( - packOp.getSourceRank(), utils::IteratorType::parallel); - return iteratorTypes; - } - - SmallVector getIterationDomain(Operation *op, OpBuilder &b) const { - return getPackUnPackIterationDomain(cast(op), b); - } - - FailureOr - getTiledImplementation(Operation *op, OpBuilder &b, - ArrayRef offsets, - ArrayRef sizes) const { - auto packOp = cast(op); - Location loc = packOp.getLoc(); - - // The tiling is applied on interchanged dimensions. We have to undo the - // interchange to map sizes and offsets to the original input. - int64_t inputRank = packOp.getSourceRank(); - SmallVector origOffsets(offsets); - SmallVector origSizes(sizes); - applyPermToRange(origOffsets, origSizes, - invertPermutationVector(packOp.getOuterDimsPerm())); - - DenseMap dimAndTileMapping = - packOp.getDimAndTileMapping(); - SmallVector srcDimValues = - tensor::getMixedSizes(b, loc, packOp.getSource()); - SmallVector inputIndices, inputSizes; - for (auto dim : llvm::seq(0, inputRank)) { - using AV = affine::AffineValueExpr; - affine::AffineBuilder ab(b, loc); - AffineExpr dim0, dim1, sym; - bindDims(b.getContext(), dim0, dim1); - bindSymbols(b.getContext(), sym); - if (dimAndTileMapping.count(dim)) { - // If the data dimension is tiled, the i-th index is the product of - // offset_i and tile_i, and the i-th size is the product of sizes_i and - // tile_i. - auto avOffset = AV(dim0).bind(origOffsets[dim]); - auto avSize = AV(dim0).bind(origSizes[dim]); - auto avTileSize = AV(sym).bind(dimAndTileMapping[dim]); - inputIndices.push_back(ab.mul(avOffset, avTileSize)); - inputSizes.push_back(ab.mul(avSize, avTileSize)); - } else { - inputIndices.push_back(origOffsets[dim]); - inputSizes.push_back(origSizes[dim]); - } - - // Limit the size of the input operand for incomplete tiles. 
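// A worked example of the clamp applied below, with hypothetical numbers
// (source dim of size 30, inner tile 8, so the pack pads the last tile):
// for the iteration-domain tile at outer offset 3 with outer size 1, the
// source index is 3 * 8 = 24 and the unclamped source size is 1 * 8 = 8;
// the min clamps it to min(8, 30 - 24) = 6, so only the valid remainder of
// the padded tile is read from the source.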
- if (packOp.getPaddingValue()) { - OpFoldResult dimSize = srcDimValues[dim]; - auto avDimSize = AV(dim0).bind(dimSize); - auto avInputIdx = AV(dim1).bind(inputIndices.back()); - inputSizes.back() = - ab.min({inputSizes.back(), ab.sub(avDimSize, avInputIdx)}); - } - } - - auto oneAttr = b.getI64IntegerAttr(1); - SmallVector strides(inputRank, oneAttr); - - SmallVector tiledOperands; - auto sourceSlice = b.create( - loc, packOp.getSource(), inputIndices, inputSizes, strides); - tiledOperands.push_back(sourceSlice); - - SmallVector outputOffsets, outputSizes; - if (failed(getResultTilePosition(op, b, 0, offsets, sizes, outputOffsets, - outputSizes))) - return {}; - - strides.append(packOp.getDestRank() - inputRank, oneAttr); - auto outSlice = b.create( - loc, packOp.getDest(), outputOffsets, outputSizes, strides); - tiledOperands.push_back(outSlice); - - if (auto val = packOp.getPaddingValue()) - tiledOperands.push_back(val); - for (auto tile : packOp.getInnerTiles()) - tiledOperands.push_back(tile); - - Operation *tiledPackOp = b.create( - loc, TypeRange{outSlice.getType()}, tiledOperands, op->getAttrs()); - - return TilingResult{ - {tiledPackOp}, - SmallVector(tiledPackOp->getResults()), - llvm::to_vector(ArrayRef{sourceSlice, outSlice})}; - } - - LogicalResult - getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber, - ArrayRef offsets, - ArrayRef sizes, - SmallVector &resultOffsets, - SmallVector &resultSizes) const { - // The iteration domain is over outer dimensions of packed layout. In this - // context, the outer dimensions of `resultOffsets` are `offsets`. The - // inner dimensions of `resultOffsets` are zeros because tiling is not - // applied to them. - auto packOp = cast(op); - int64_t inputRank = packOp.getSourceRank(); - int64_t outputRank = packOp.getDestRank(); - auto zeroAttr = b.getI64IntegerAttr(0); - resultOffsets.assign(offsets.begin(), offsets.end()); - resultOffsets.append(outputRank - inputRank, zeroAttr); - - ReifiedRankedShapedTypeDims outputShape; - (void)reifyResultShapes(b, packOp, outputShape); - resultSizes.assign(sizes.begin(), sizes.end()); - for (auto dataTileDim : llvm::seq(inputRank, outputRank)) - resultSizes.push_back(outputShape[0][dataTileDim]); - - return success(); - } - - FailureOr - generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber, - ArrayRef offsets, - ArrayRef sizes) const { - auto packOp = cast(op); - int64_t numTiles = packOp.getInnerDimsPos().size(); - - // tensor.pack op is fusible (as a producer) only if full inner tiles are - // iterated or inner dims are not tiled. Otherwise, it will generate a - // sequence of non-trivial ops (for partial tiles). - for (auto offset : offsets.take_back(numTiles)) - if (!isConstantIntValue(offset, 0)) - return failure(); - - for (auto iter : - llvm::zip_equal(packOp.getMixedTiles(), sizes.take_back(numTiles))) - if (!isEqualConstantIntOrValue(std::get<0>(iter), std::get<1>(iter))) - return failure(); - - FailureOr tilingResult = getTiledImplementation( - op, b, offsets.drop_back(numTiles), sizes.drop_back(numTiles)); - if (failed(tilingResult)) - return failure(); - return tilingResult.value(); - } - - /// Method to return the position of iteration domain tile computed by the - /// tiled operation. In current `tensor.pack` context, the `resultOffsets` and - /// `resultSizes` only cover outer dimensions. 
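// A worked example of the operand-tile to iteration-domain-tile mapping
// computed below (shapes are hypothetical, no outer_dims_perm): for a pack
// of tensor<128x256xf32> with inner_dims_pos = [0, 1] and inner_tiles =
// [32, 16], a source tile with offsets [32, 64] and sizes [64, 32] maps to
// outer offsets [32 floordiv 32, 64 floordiv 16] = [1, 4] and outer sizes
// [64 ceildiv 32, 32 ceildiv 16] = [2, 2]; the sizes are accepted only
// because each is a multiple of the corresponding inner tile size.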
- LogicalResult getIterationDomainTileFromOperandTile( - Operation *op, OpBuilder &b, unsigned operandNumber, - ArrayRef offsets, ArrayRef sizes, - SmallVectorImpl &resultOffsets, - SmallVectorImpl &resultSizes) const { - if (operandNumber != 0) - return failure(); - - auto packOp = cast(op); - // It is not trivial to infer dest tile from source tile if `packOp` has - // padding semantic. - if (packOp.getPaddingValue()) - return failure(); - - Location loc = packOp.getLoc(); - - SmallVector outerDimOffsets, outerDimSizes; - DenseMap dimAndTileMapping = - packOp.getDimAndTileMapping(); - for (auto dim : llvm::seq(packOp.getSourceRank())) { - if (dimAndTileMapping.count(dim)) { - FailureOr cstSize = - ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::UB, sizes[dim], - /*stopCondition=*/nullptr, /*closedUB=*/true); - std::optional cstInnerSize = - getConstantIntValue(dimAndTileMapping[dim]); - // Currently fusing `packOp` as consumer only expects perfect tiling - // scenario because even if without padding semantic, the `packOp` may - // also yield incomplete tiles. E.g. tensor<30xf32> -> tensor<5x6xf32>, - // where the `tileSize` from operand of `packOp` is 5, which is not - // exactly divided by `innerTile`(=6) of `packOp`. As the result: - // 1. the first slice is extracted from (0) to (4) and inserted into - // (0,0)~(0,4) at first row. - // 2. the second slice is extracted from (5) to (9) and SHOULD BE - // respectively inserted into two rows with different length, including - // first row: (0,5) and second row (1,0)~(1,3). It is hard to coordinate - // them, thus adding below constraint to bypass them temporarily. In - // another word, we can only support tiling with consumer if the tile - // size for the producer is a multiple of the inner tile size for the - // packed dimensions at this moment. - if (failed(cstSize) || !cstInnerSize || *cstSize % *cstInnerSize != 0) { - return failure(); - } - - using AV = affine::AffineValueExpr; - affine::AffineBuilder ab(b, loc); - AffineExpr dim0, sym; - bindDims(b.getContext(), dim0); - bindSymbols(b.getContext(), sym); - auto avOffset = AV(dim0).bind(offsets[dim]); - auto avSize = AV(dim0).bind(sizes[dim]); - auto avTileSize = AV(sym).bind(dimAndTileMapping[dim]); - outerDimOffsets.push_back(ab.floor(avOffset, avTileSize)); - outerDimSizes.push_back(ab.ceil(avSize, avTileSize)); - } else { - outerDimOffsets.push_back(offsets[dim]); - outerDimSizes.push_back(sizes[dim]); - } - } - applyPermToRange(outerDimOffsets, outerDimSizes, packOp.getOuterDimsPerm()); - resultOffsets = outerDimOffsets; - resultSizes = outerDimSizes; - return success(); - } - - /// Method to return the tiled implementation of tensor.pack as a consumer. 
- FailureOr getTiledImplementationFromOperandTile( - Operation *op, OpBuilder &b, unsigned operandNumber, - ArrayRef offsets, ArrayRef sizes) const { - if (operandNumber != 0) - return failure(); - - auto packOp = cast(op); - Location loc = packOp.getLoc(); - - int64_t inputRank = packOp.getSourceRank(); - auto oneAttr = b.getI64IntegerAttr(1); - SmallVector strides(inputRank, oneAttr); - - SmallVector tiledOperands; - auto sourceSlice = b.create(loc, packOp.getSource(), - offsets, sizes, strides); - tiledOperands.push_back(sourceSlice); - - SmallVector outerDimOffsets, outerDimSizes; - if (failed(getIterationDomainTileFromOperandTile( - op, b, /*operandNumber=*/0, offsets, sizes, outerDimOffsets, - outerDimSizes))) - return failure(); - - SmallVector outputOffsets, outputSizes; - if (failed(getResultTilePosition(op, b, 0, outerDimOffsets, outerDimSizes, - outputOffsets, outputSizes))) - return failure(); - - strides.append(packOp.getDestRank() - inputRank, oneAttr); - auto outSlice = b.create( - loc, packOp.getDest(), outputOffsets, outputSizes, strides); - tiledOperands.push_back(outSlice); - - assert(!packOp.getPaddingValue() && "Expect no padding semantic"); - for (auto tile : packOp.getInnerTiles()) - tiledOperands.push_back(tile); - - Operation *tiledPackOp = b.create( - loc, TypeRange{outSlice.getType()}, tiledOperands, op->getAttrs()); - - return TilingResult{ - {tiledPackOp}, - SmallVector(tiledPackOp->getResults()), - llvm::to_vector(ArrayRef{sourceSlice, outSlice})}; - } -}; - -struct UnpackTileDimInfo { - bool isAlignedToInnerTileSize; - OpFoldResult sourceOffset; - OpFoldResult sourceSize; - OpFoldResult resultOffset; - OpFoldResult destExpandedSize; -}; - -/// Returns the needed information for tiling unpack op on `tileDim` with given -/// `tileOffset` and `tileSize`. For more details, see the comment of the -/// `getTiledImplementation`. -static UnpackTileDimInfo getUnpackTileDimInfo(OpBuilder &b, UnPackOp unpackOp, - int64_t tileDim, - OpFoldResult tileOffset, - OpFoldResult tileSize) { - UnpackTileDimInfo info; - Attribute zeroAttr = b.getIndexAttr(0); - Attribute oneAttr = b.getIndexAttr(1); - DenseMap dimAndTileMapping = - unpackOp.getDimAndTileMapping(); - // The dimension is not one of packed data dimension. - if (!dimAndTileMapping.count(tileDim)) { - info.isAlignedToInnerTileSize = true; - info.sourceOffset = tileOffset; - info.sourceSize = tileSize; - info.resultOffset = zeroAttr; - info.destExpandedSize = tileSize; - return info; - } - - Location loc = unpackOp.getLoc(); - using AV = affine::AffineValueExpr; - affine::AffineBuilder ab(b, loc); - AffineExpr dim0, dim1, sym0; - bindDims(b.getContext(), dim0, dim1); - bindSymbols(b.getContext(), sym0); - - OpFoldResult innerTileSize = dimAndTileMapping[tileDim]; - - info.isAlignedToInnerTileSize = false; - FailureOr cstSize = ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::UB, tileSize, - /*stopCondition=*/nullptr, /*closedUB=*/true); - std::optional cstInnerSize = getConstantIntValue(innerTileSize); - if (!failed(cstSize) && cstInnerSize) { - if (*cstSize % *cstInnerSize == 0) - info.isAlignedToInnerTileSize = true; - - // If the tiling size equals to the inner tiling size, the outer dims are - // always 1. 
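// A worked example of the aligned cases handled below (numbers are
// hypothetical, inner tile size 8): a tile at offset 24 with size 8 reads
// exactly one source row, i.e. sourceOffset = 24 floordiv 8 = 3,
// sourceSize = 1, resultOffset = 0. With size 16 instead, two source rows
// are read (sourceSize = 16 ceildiv 8 = 2) and destExpandedSize stays
// equal to the requested tile size.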
- if (*cstInnerSize == *cstSize) { - auto lhs = AV(dim0).bind(tileOffset); - auto rhs = AV(dim1).bind(innerTileSize); - info.sourceOffset = ab.floor(lhs, rhs); - info.sourceSize = oneAttr; - info.resultOffset = zeroAttr; - info.destExpandedSize = tileSize; - return info; - } - } - - if (info.isAlignedToInnerTileSize) { - info.sourceOffset = - ab.floor(AV(dim0).bind(tileOffset), AV(dim1).bind(innerTileSize)); - info.resultOffset = zeroAttr; - info.destExpandedSize = tileSize; - - // The ceilDiv is needed here because there could be incomplete tile even - // it is perfect tiling cases. E.g., - // %0 = unpack tensor<33x2xf32> into tensor<64xf32> - // If the tiling size is 32, there will be 3 tiles. Two of them have - // size=32; one of them have size=2. The size is represented using - // affine_min op; we need ceilDiv. - info.sourceSize = - ab.ceil(AV(dim0).bind(tileSize), AV(dim1).bind(innerTileSize)); - return info; - } - - affine::DivModValue firstCoord = affine::getDivMod( - b, loc, getValueOrCreateConstantIndexOp(b, loc, tileOffset), - getValueOrCreateConstantIndexOp(b, loc, innerTileSize)); - OpFoldResult tileExclusiveBound = - ab.add(AV(dim0).bind(tileOffset), AV(dim1).bind(tileSize)); - affine::DivModValue lastCoord = affine::getDivMod( - b, loc, - getValueOrCreateConstantIndexOp( - b, loc, - ab.sub(AV(dim0).bind(tileExclusiveBound), AV(dim1).bind(oneAttr))), - getValueOrCreateConstantIndexOp(b, loc, innerTileSize)); - - OpFoldResult lengthMinusOne = ab.sub(AV(dim0).bind(lastCoord.quotient), - AV(dim1).bind(firstCoord.quotient)); - info.sourceSize = - ab.add(AV(dim0).bind(lengthMinusOne), AV(dim1).bind(oneAttr)); - info.sourceOffset = firstCoord.quotient; - info.resultOffset = firstCoord.remainder; - // Do not create an Affine ops for expanded size because the affine op is too - // complicated which would trigger an issue in affine ops simplification. - info.destExpandedSize = b.createOrFold( - loc, getValueOrCreateConstantIndexOp(b, loc, info.sourceSize), - getValueOrCreateConstantIndexOp(b, loc, innerTileSize)); - return info; -} - -struct UnPackOpTiling - : public TilingInterface::ExternalModel { - - SmallVector getLoopIteratorTypes(Operation *op) const { - auto unpackOp = cast(op); - SmallVector iteratorTypes( - unpackOp.getDestRank(), utils::IteratorType::parallel); - return iteratorTypes; - } - - SmallVector getIterationDomain(Operation *op, OpBuilder &b) const { - return getPackUnPackIterationDomain(cast(op), b); - } - - /// There are two cases in tiling unpack ops. If the tiling size is aligned to - /// the inner tile size, the corresponding tiles of source are all complete. - /// Otherwise, there are in-complete tiles. We will need to expand the slice - /// of source for getting complete tiles. The tiled unpack op unpacks more - /// data from source, so We'll need an extract_slice op to shift and truncate - /// the output. - /// Take Nn_to_N as an example. Say that N=32, n=8, and tiling_size=15. The - /// coordinates of second tile (i.e., result[15..31]) are - /// [(1, 7), (2, 0,), (2, 1) ... (3, 6), (3, 7)]. The first row and the last - /// row are incomplete tiles. To represent the unpack op, we have to complete - /// the rows. I.e., the input coordinates would start with (1, 0); end with - /// (3, 7). In this context, the tiled unpack produces a (3 * n) elements - /// because there are 3 rows in total. Follow by a tensor.extract_slice op, we - /// can get the actual result. 
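// Continuing the Nn_to_N example above (N = 32, n = 8, tile offset 15,
// tile size 15): the covered source rows run from 15 floordiv 8 = 1 to
// (15 + 15 - 1) floordiv 8 = 3, so the source slice spans 3 rows and the
// tiled unpack produces 3 * 8 = 24 elements; the trailing
// tensor.extract_slice with offset 15 mod 8 = 7 and size 15 then recovers
// the requested result tile.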
- FailureOr - getTiledImplementation(Operation *op, OpBuilder &b, - ArrayRef offsets, - ArrayRef sizes) const { - auto unpackOp = cast(op); - int64_t srcRank = unpackOp.getSourceRank(); - int64_t destRank = unpackOp.getDestRank(); - int64_t numInnerTiles = srcRank - destRank; - Location loc = unpackOp.getLoc(); - - // The perfect tiling case indicates that the tiling sizes are multiple of - // inner_tile_size. In this context, no extra data is needed when - // representing the tiled unpack op. - bool isPerfectTilingCase = true; - Attribute oneAttr = b.getIndexAttr(1); - SmallVector sliceSrcStrides(destRank, oneAttr); - SmallVector sliceSrcIndices, sliceSrcSizes; - SmallVector destExpandedSizes, resultOffsetsFromDest; - for (auto dim : llvm::seq(0, destRank)) { - UnpackTileDimInfo info = - getUnpackTileDimInfo(b, unpackOp, dim, offsets[dim], sizes[dim]); - if (!info.isAlignedToInnerTileSize) - isPerfectTilingCase = false; - sliceSrcIndices.push_back(info.sourceOffset); - sliceSrcSizes.push_back(info.sourceSize); - destExpandedSizes.push_back(info.destExpandedSize); - resultOffsetsFromDest.push_back(info.resultOffset); - } - - // The tiling is applied on destination dimensions. We have to apply the - // interchange on source dimensions if outer_dims_perm is set. - applyPermToRange(sliceSrcIndices, sliceSrcSizes, - unpackOp.getOuterDimsPerm()); - Attribute zeroAttr = b.getIndexAttr(0); - sliceSrcIndices.append(numInnerTiles, zeroAttr); - sliceSrcSizes.append(unpackOp.getMixedTiles()); - sliceSrcStrides.append(numInnerTiles, oneAttr); - SmallVector generatedSlices; - ExtractSliceOp sliceSource = - b.create(loc, unpackOp.getSource(), sliceSrcIndices, - sliceSrcSizes, sliceSrcStrides); - generatedSlices.push_back(sliceSource); - - SmallVector destStrides(destRank, oneAttr); - Value sliceDest; - if (isPerfectTilingCase) { - auto destSliceOp = b.create(loc, unpackOp.getDest(), - offsets, sizes, destStrides); - sliceDest = destSliceOp; - generatedSlices.push_back(destSliceOp); - } else { - sliceDest = b.create(loc, destExpandedSizes, - unpackOp.getDestType().getElementType()); - } - - SmallVector tiledOperands = {sliceSource.getResult(), sliceDest}; - for (auto tile : unpackOp.getInnerTiles()) - tiledOperands.push_back(tile); - - Operation *tiledUnpackOp = b.create( - loc, TypeRange{sliceDest.getType()}, tiledOperands, op->getAttrs()); - - if (isPerfectTilingCase) - return TilingResult{{tiledUnpackOp}, - SmallVector(tiledUnpackOp->getResults()), - generatedSlices}; - - auto extractSlice = - b.create(loc, tiledUnpackOp->getResult(0), - resultOffsetsFromDest, sizes, destStrides); - return TilingResult{ - {tiledUnpackOp}, {extractSlice.getResult()}, generatedSlices}; - } - - LogicalResult - getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber, - ArrayRef offsets, - ArrayRef sizes, - SmallVector &resultOffsets, - SmallVector &resultSizes) const { - resultOffsets = llvm::to_vector(offsets); - resultSizes = llvm::to_vector(sizes); - return success(); - } - - FailureOr - generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber, - ArrayRef offsets, - ArrayRef sizes) const { - FailureOr tilingResult = - getTiledImplementation(op, b, offsets, sizes); - if (failed(tilingResult)) - return failure(); - return tilingResult.value(); - } - - /// Method to return the position of iteration domain tile computed by the - /// tiled operation. 
- LogicalResult getIterationDomainTileFromOperandTile( - Operation *op, OpBuilder &b, unsigned operandNumber, - ArrayRef offsets, ArrayRef sizes, - SmallVectorImpl &resultOffsets, - SmallVectorImpl &resultSizes) const { - auto unPackOp = cast(op); - // If the operand tile is the dest, then no adjustment is needed. - if (operandNumber == unPackOp.getDestMutable().getOperandNumber()) { - resultOffsets = llvm::to_vector(offsets); - resultSizes = llvm::to_vector(sizes); - return success(); - } - Location loc = unPackOp.getLoc(); - - int64_t numTiles = unPackOp.getInnerDimsPos().size(); - auto destOffsets = offsets.drop_back(numTiles); - auto destSizes = sizes.drop_back(numTiles); - // The tiling is applied on interchanged dimensions. We have to undo the - // interchange to map sizes and offsets to the original input. - int64_t outputRank = unPackOp.getDestRank(); - ReifiedRankedShapedTypeDims reifiedReturnShapes; - if (failed(reifyResultShapes(b, unPackOp, reifiedReturnShapes))) - return failure(); - SmallVector outputMixedSizes = reifiedReturnShapes.front(); - SmallVector origOffsets(destOffsets); - SmallVector origSizes(destSizes); - applyPermToRange(origOffsets, origSizes, - invertPermutationVector(unPackOp.getOuterDimsPerm())); - - DenseMap dimAndTileMapping = - unPackOp.getDimAndTileMapping(); - - for (auto dim : llvm::seq(0, outputRank)) { - using AV = affine::AffineValueExpr; - affine::AffineBuilder ab(b, loc); - AffineExpr dim0, dim1, sym0; - bindDims(b.getContext(), dim0, dim1); - bindSymbols(b.getContext(), sym0); - if (dimAndTileMapping.count(dim)) { - // If the data dimension is tiled, the i-th index is the product of - // offset_i and tile_i, and the i-th size is the product of sizes_i and - // tile_i. The sizes must be clamped to the sizes of the unpack result. - auto avOffset = AV(dim0).bind(origOffsets[dim]); - auto avSize = AV(dim0).bind(origSizes[dim]); - auto avTileSize = AV(sym0).bind(dimAndTileMapping[dim]); - auto avResultSize = AV(dim0).bind(outputMixedSizes[dim]); - resultOffsets.push_back(ab.mul(avOffset, avTileSize)); - auto avResultOffset = AV(dim1).bind(resultOffsets.back()); - resultSizes.push_back(ab.min({ab.mul(avSize, avTileSize), - ab.sub(avResultSize, avResultOffset)})); - } else { - resultOffsets.push_back(origOffsets[dim]); - resultSizes.push_back(origSizes[dim]); - } - } - return success(); - } - - /// Method to return the tiled implementation of tensor.unpack as a consumer. - FailureOr getTiledImplementationFromOperandTile( - Operation *op, OpBuilder &b, unsigned operandNumber, - ArrayRef offsets, ArrayRef sizes) const { - auto unPackOp = cast(op); - // tensor.unpack op is fusible (as a consumer) only if inner dims are not - // tiled. - int64_t numTiles = unPackOp.getInnerDimsPos().size(); - for (auto iter : - llvm::zip_equal(unPackOp.getMixedTiles(), sizes.take_back(numTiles))) { - if (!isEqualConstantIntOrValue(std::get<0>(iter), std::get<1>(iter))) - return failure(); - } - - Location loc = unPackOp.getLoc(); - - // Fetch offset/size for creating the slice of the dest operand of - // unpack op. - SmallVector outputOffsets, outputSizes; - if (failed(getIterationDomainTileFromOperandTile( - op, b, /*operandNumber=*/0, offsets, sizes, outputOffsets, - outputSizes))) - return failure(); - - auto oneAttr = b.getI64IntegerAttr(1); - int64_t outputRank = unPackOp.getDestRank(); - SmallVector strides(outputRank, oneAttr); - - SmallVector tiledOperands; - // Create slice of the dest operand. 
- auto extractDestSlice = b.create( - loc, unPackOp.getDest(), outputOffsets, outputSizes, strides); - tiledOperands.push_back(extractDestSlice); - - SmallVector inputOffsets, inputSizes; - strides.append(unPackOp.getSourceRank() - outputRank, oneAttr); - // Create slice of the source operand. - auto extractSourceSlice = b.create( - loc, unPackOp.getSource(), offsets, sizes, strides); - tiledOperands.insert(tiledOperands.begin(), extractSourceSlice); - for (auto tile : unPackOp.getInnerTiles()) - tiledOperands.push_back(tile); - - // Create tiled unpack op. - Operation *tiledUnPackOp = - b.create(loc, TypeRange{extractDestSlice.getType()}, - tiledOperands, op->getAttrs()); - - return TilingResult{{tiledUnPackOp}, - SmallVector(tiledUnPackOp->getResults()), - llvm::to_vector(ArrayRef{ - extractSourceSlice, extractDestSlice})}; - } -}; - } // namespace FailureOr tensor::bubbleUpPadSlice(OpBuilder &b, @@ -932,15 +290,5 @@ void mlir::tensor::registerTilingInterfaceExternalModels( DialectRegistry ®istry) { registry.addExtension(+[](MLIRContext *ctx, TensorDialect *dialect) { tensor::PadOp::attachInterface(*ctx); - tensor::PackOp::attachInterface(*ctx); - tensor::UnPackOp::attachInterface(*ctx); - }); -} - -void mlir::tensor::registerTilingInterfaceExternalModelsForPackUnPackOps( - DialectRegistry ®istry) { - registry.addExtension(+[](MLIRContext *ctx, TensorDialect *dialect) { - tensor::PackOp::attachInterface(*ctx); - tensor::UnPackOp::attachInterface(*ctx); }); } diff --git a/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp b/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp index 99199252710f9..f3560d08ff769 100644 --- a/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp +++ b/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp @@ -100,11 +100,6 @@ void transform::ApplyFoldTensorEmptyPatternsOp::populatePatterns( tensor::populateFoldTensorEmptyPatterns(patterns, getFoldSingleUseOnly()); } -void transform::ApplyFoldIntoPackAndUnpackPatternsOp::populatePatterns( - RewritePatternSet &patterns) { - tensor::populateFoldIntoPackAndUnpackPatterns(patterns); -} - void transform::ApplyFoldTensorSubsetOpsPatternsOp::populatePatterns( RewritePatternSet &patterns) { tensor::populateFoldTensorSubsetOpPatterns(patterns); diff --git a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt index cc6275fee671a..7880d1c5a0c5d 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt @@ -6,7 +6,6 @@ add_mlir_dialect_library(MLIRTensorTransforms FoldTensorSubsetOps.cpp IndependenceTransforms.cpp MergeConsecutiveInsertExtractSlicePatterns.cpp - PackAndUnpackPatterns.cpp ReshapePatterns.cpp RewriteAsConstant.cpp SwapExtractSliceWithProducerPatterns.cpp diff --git a/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp index 60b0c3e759b6c..fa748cf01977f 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp @@ -93,49 +93,6 @@ struct FoldEmptyTensorWithExtractSliceOp bool foldSingleUseOnly = false; }; -/// tensor.empty does not define any tensor contents, so an unpadded pack -/// can be folded away. -struct FoldEmptyTensorWithPackOp : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(PackOp packOp, - PatternRewriter &rewriter) const override { - // Check for tensor.empty source. 
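// For illustration (hypothetical shapes), the fold below rewrites
//   %e = tensor.empty() : tensor<8x16xf32>
//   %p = tensor.pack %e inner_dims_pos = [0, 1] inner_tiles = [4, 4]
//        into %dest : tensor<8x16xf32> -> tensor<2x4x4x4xf32>
// into just %dest: an unpadded pack of undefined contents defines nothing
// either. Packs with a padding value are rejected further down.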
- auto emptyOp = packOp.getSource().getDefiningOp(); - if (!emptyOp) - return failure(); - - // Check for padding. - // Packing with padding cannot be simply removed. - if (packOp.getPaddingValue()) - return rewriter.notifyMatchFailure(packOp, "expects no padding value"); - - // Replace the pack directly with its destination. - rewriter.replaceOp(packOp, packOp.getDest()); - - return success(); - } -}; - -/// tensor.empty does not define any tensor contents, so an unpack -/// can be folded away. -struct FoldEmptyTensorWithUnPackOp : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(UnPackOp unPackOp, - PatternRewriter &rewriter) const override { - // Check for tensor.empty source. - auto emptyOp = unPackOp.getSource().getDefiningOp(); - if (!emptyOp) - return failure(); - - // Replace the unpack directly with its destination. - rewriter.replaceOp(unPackOp, unPackOp.getDest()); - - return success(); - } -}; - // Fold concat operation where all the operands are empty. struct FoldConcatsOfEmpty : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -176,7 +133,6 @@ void mlir::tensor::populateFoldTensorEmptyPatterns(RewritePatternSet &patterns, FoldEmptyTensorWithReshapeOp, FoldEmptyTensorWithReshapeOp>( patterns.getContext(), /*benefit=*/1, foldSingleUseOnly); - patterns.add(patterns.getContext(), - /*benefit=*/1); + patterns.add(patterns.getContext(), + /*benefit=*/1); } diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp index 5c16e538ac242..52462aae4bc80 100644 --- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp @@ -92,61 +92,6 @@ mlir::tensor::computeTransposedType(RankedTensorType rankedTensorType, return transposedTensorType; } -/// The permutation can be obtained from two permutations: -/// a) Compute the permutation vector to move the last `numPackedDims` into -/// the `innerPosDims` of a shape of rank `rank`. -/// b) Compute the permutation vector to move outer dims if the -/// `outerPerm` parameter is not empty. -/// Apply (b) permutation on (a) permutation to get the final permutation. 
-static SmallVector -computePackUnPackPerm(int64_t rank, ArrayRef &innerDimsPos, - ArrayRef &outerPerm, - PackingMetadata &packingMetadata) { - int64_t numPackedDims = innerDimsPos.size(); - auto lastDims = - llvm::to_vector(llvm::seq(rank - numPackedDims, rank)); - packingMetadata = computePackingMetadata(rank, innerDimsPos); - SmallVector innerPositionsPerm = - computePermutationVector(rank, lastDims, packingMetadata.insertPositions); - - SmallVector outerPos = packingMetadata.outerPositions; - if (!outerPerm.empty()) - applyPermutationToVector(outerPos, outerPerm); - SmallVector outerPositionPerm = - computePermutationVector(rank, packingMetadata.outerPositions, outerPos); - - SmallVector packInverseDestPermutation = innerPositionsPerm; - applyPermutationToVector(packInverseDestPermutation, outerPositionPerm); - return packInverseDestPermutation; -} - -SmallVector mlir::tensor::getPackInverseDestPerm(PackOp packOp) { - - PackingMetadata pMetadata; - int64_t packedRank = packOp.getDestType().getRank(); - ArrayRef innerDimPos = packOp.getInnerDimsPos(); - ArrayRef outerPerm = packOp.getOuterDimsPerm(); - SmallVector packInvDestPerm = - computePackUnPackPerm(packedRank, innerDimPos, outerPerm, pMetadata); - return packInvDestPerm; -} - -SmallVector mlir::tensor::getUnPackInverseSrcPerm(UnPackOp unpackOp) { - PackingMetadata metadata; - return mlir::tensor::getUnPackInverseSrcPerm(unpackOp, metadata); -} - -SmallVector -mlir::tensor::getUnPackInverseSrcPerm(UnPackOp unpackOp, - PackingMetadata &metadata) { - int64_t unpackRank = unpackOp.getSourceType().getRank(); - ArrayRef innerDimPos = unpackOp.getInnerDimsPos(); - ArrayRef outerPerm = unpackOp.getOuterDimsPerm(); - SmallVector unpackInvSrcPerm = - computePackUnPackPerm(unpackRank, innerDimPos, outerPerm, metadata); - return unpackInvSrcPerm; -} - bool mlir::tensor::isCastLikeInsertSliceOp(InsertSliceOp op) { llvm::SmallBitVector droppedDims = op.getDroppedDims(); int64_t srcDim = 0; diff --git a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp index 70b2aaf9a17e0..0336423c57b1d 100644 --- a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp +++ b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp @@ -483,3 +483,13 @@ PackingMetadata mlir::computePackingMetadata(int64_t packedRank, } return res; } + +OpFoldResult mlir::reshapeConstantSource(DenseElementsAttr source, + TensorType result, + std::optional cst) { + if (source && source.isSplat() && result.hasStaticShape() && + (!cst.has_value() || source.getSplatValue() == cst.value())) + return source.resizeSplat(result); + + return {}; +} diff --git a/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp index 260ac9ce589a3..f1fbb39b97fc4 100644 --- a/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp +++ b/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp @@ -131,6 +131,39 @@ struct DotBF16OpConversion : public ConvertOpToLLVMPattern { } }; +struct CvtPackedF32ToBF16Conversion + : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(CvtPackedF32ToBF16Op op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto typeA = dyn_cast(op.getA().getType()); + unsigned elemBitWidth = typeA.getElementTypeBitWidth(); + unsigned opBitWidth = typeA.getShape()[0] * elemBitWidth; + + auto opType = op.getDst().getType(); + auto opA = op.getA(); + + switch (opBitWidth) { + case 256: { 
+ rewriter.replaceOpWithNewOp(op, opType, opA); + break; + } + case 512: { + rewriter.replaceOpWithNewOp(op, opType, opA); + break; + } + default: { + return rewriter.notifyMatchFailure( + op, "unsupported AVX512-BF16 packed f32 to bf16 variant"); + } + } + + return success(); + } +}; + struct RsqrtOpConversion : public ConvertOpToLLVMPattern { using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; @@ -202,8 +235,10 @@ using Registry = RegistryImpl< void mlir::populateX86VectorLegalizeForLLVMExportPatterns( const LLVMTypeConverter &converter, RewritePatternSet &patterns) { Registry::registerPatterns(converter, patterns); - patterns.add(converter); + patterns + .add( + converter); } void mlir::configureX86VectorLegalizeForExportTarget( @@ -215,6 +250,9 @@ void mlir::configureX86VectorLegalizeForExportTarget( target.addLegalOp(); target.addLegalOp(); target.addIllegalOp(); + target.addLegalOp(); + target.addLegalOp(); + target.addIllegalOp(); target.addLegalOp(); target.addIllegalOp(); target.addLegalOp(); diff --git a/mlir/lib/Reducer/ReductionTreePass.cpp b/mlir/lib/Reducer/ReductionTreePass.cpp index 2d2744bfc2732..ef32adbab5577 100644 --- a/mlir/lib/Reducer/ReductionTreePass.cpp +++ b/mlir/lib/Reducer/ReductionTreePass.cpp @@ -56,13 +56,14 @@ static void applyPatterns(Region &region, opsInRange.push_back(&op.value()); } - // `applyOpPatternsAndFold` may erase the ops so we can't do the pattern - // matching in above iteration. Besides, erase op not-in-range may end up in - // invalid module, so `applyOpPatternsAndFold` should come before that - // transform. + // `applyOpPatternsGreedily` with folding may erase the ops, so we can't do + // the pattern matching in the above iteration. Besides, erasing an op that is + // not in range may leave the module invalid, so `applyOpPatternsGreedily` + // with folding should come before that transform. for (Operation *op : opsInRange) { - // `applyOpPatternsAndFold` returns whether the op is convered. Omit it - // because we don't have expectation this reduction will be success or not. + // `applyOpPatternsGreedily` with folding returns whether the rewrite + // converged. Ignore the result because we have no expectation of whether + // this reduction will succeed or not. GreedyRewriteConfig config; config.strictMode = GreedyRewriteStrictness::ExistingOps; (void)applyOpPatternsGreedily(op, patterns, config); diff --git a/mlir/lib/Target/LLVM/NVVM/Target.cpp b/mlir/lib/Target/LLVM/NVVM/Target.cpp index e240a7ae4917f..fa8c597da58b1 100644 --- a/mlir/lib/Target/LLVM/NVVM/Target.cpp +++ b/mlir/lib/Target/LLVM/NVVM/Target.cpp @@ -321,6 +321,25 @@ std::optional NVPTXSerializer::findTool(StringRef tool) { return std::nullopt; } +/// Adds optional command-line arguments to existing arguments. +template +static void setOptionalCommandlineArguments(NVVMTargetAttr target, + SmallVectorImpl &ptxasArgs) { + if (!target.hasCmdOptions()) + return; + + std::optional cmdOptions = target.getCmdOptions(); + for (Attribute attr : cast(cmdOptions->getValue())) { + if (auto strAttr = dyn_cast(attr)) { + if constexpr (std::is_same_v) { + ptxasArgs.push_back(strAttr.getValue()); + } else if constexpr (std::is_same_v) { + ptxasArgs.push_back(strAttr.getValue().data()); + } + } + } +} + // TODO: clean this method & have a generic tool driver or never emit binaries // with this mechanism and let another stage take care of it.
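// A minimal sketch of how the helper above is meant to be instantiated,
// assuming it is a function template over the vector's element type and
// that its constexpr branches dispatch on StringRef vs. const char *
// (variable names below are hypothetical):
//
//   SmallVector<StringRef, 12> ptxasArgs;        // standalone ptxas path
//   setOptionalCommandlineArguments(getTarget(), ptxasArgs);
//
//   SmallVector<const char *> compilerArgs;      // nvPTXCompiler path
//   setOptionalCommandlineArguments(getTarget(), compilerArgs);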
std::optional> @@ -359,8 +378,8 @@ NVPTXSerializer::compileToBinary(const std::string &ptxCode) { return std::nullopt; TmpFile cubinFile; if (createFatbin) { - Twine cubinFilename = ptxFile->first + ".cubin"; - cubinFile = TmpFile(cubinFilename.str(), llvm::FileRemover(cubinFilename)); + std::string cubinFilename = (ptxFile->first + ".cubin").str(); + cubinFile = TmpFile(cubinFilename, llvm::FileRemover(cubinFilename)); } else { cubinFile.first = binaryFile->first; } @@ -412,6 +431,9 @@ NVPTXSerializer::compileToBinary(const std::string &ptxCode) { useFatbin32 = true; } + // Set optional command line arguments + setOptionalCommandlineArguments(getTarget(), ptxasArgs); + // Create the `fatbinary` args. StringRef chip = getTarget().getChip(); // Remove the arch prefix to obtain the compute capability. @@ -562,6 +584,8 @@ NVPTXSerializer::compileToBinaryNVPTX(const std::string &ptxCode) { cmdOpts.second.append( {"-arch", getTarget().getChip().data(), "--opt-level", optLevel.c_str()}); + // Set optional command line arguments + setOptionalCommandlineArguments(getTarget(), cmdOpts.second); // Create the compiler handle. RETURN_ON_NVPTXCOMPILER_ERROR( nvPTXCompilerCreate(&compiler, ptxCode.size(), ptxCode.c_str())); diff --git a/mlir/test/Dialect/GPU/nvvm-attach-target.mlir b/mlir/test/Dialect/GPU/nvvm-attach-target.mlir new file mode 100644 index 0000000000000..35450e0ad6b1b --- /dev/null +++ b/mlir/test/Dialect/GPU/nvvm-attach-target.mlir @@ -0,0 +1,15 @@ +// RUN: mlir-opt %s --nvvm-attach-target="" | FileCheck %s +// RUN: mlir-opt %s --nvvm-attach-target="ptxas-cmd-options=--register-usage-level=8" | FileCheck %s -check-prefix=CHECK-OPTIONS + +module attributes {gpu.container_module} { + // CHECK-LABEL:gpu.module @kernel_module1 + // CHECK: [#nvvm.target] + // CHECK-OPTIONS: [#nvvm.target] + gpu.module @kernel_module1 { + llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr, + %arg2: !llvm.ptr, %arg3: i64, %arg4: i64, + %arg5: i64) attributes {gpu.kernel} { + llvm.return + } + } +} diff --git a/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir b/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir index 01ca4374da046..4ba4b09f52163 100644 --- a/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir +++ b/mlir/test/Dialect/Linalg/block-pack-matmul-layout.mlir @@ -38,64 +38,64 @@ func.func @block_matmul_transpose_b( // MMT4D-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)> // MMT4D-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)> // MMT4D-LABEL: func @block_matmul -// MMT4D-COUNT-3: tensor.pack +// MMT4D-COUNT-3: linalg.pack // MMT4D: linalg.generic // MMT4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MMT4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MMT4D-COUNT-1: tensor.unpack +// MMT4D-COUNT-1: linalg.unpack // MMT4D-LABEL: func @block_matmul_transpose_a -// MMT4D-COUNT-3: tensor.pack +// MMT4D-COUNT-3: linalg.pack // MMT4D: linalg.generic // MMT4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MMT4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MMT4D-COUNT-1: tensor.unpack +// MMT4D-COUNT-1: linalg.unpack // MMT4D-LABEL: func @block_matmul_transpose_b -// MMT4D-COUNT-3: tensor.pack +// MMT4D-COUNT-3: linalg.pack // MMT4D: linalg.generic // MMT4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MMT4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", 
"parallel", "reduction"] -// MMT4D-COUNT-1: tensor.unpack +// MMT4D-COUNT-1: linalg.unpack // MM4D-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)> // MM4D-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)> // MM4D-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)> // MM4D-LABEL: func @block_matmul -// MM4D-COUNT-3: tensor.pack +// MM4D-COUNT-3: linalg.pack // MM4D: linalg.generic // MM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MM4D-COUNT-1: tensor.unpack +// MM4D-COUNT-1: linalg.unpack // MM4D-LABEL: func @block_matmul_transpose_a -// MM4D-COUNT-3: tensor.pack +// MM4D-COUNT-3: linalg.pack // MM4D: linalg.generic // MM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MM4D-COUNT-1: tensor.unpack +// MM4D-COUNT-1: linalg.unpack // MM4D-LABEL: func @block_matmul_transpose_b -// MM4D-COUNT-3: tensor.pack +// MM4D-COUNT-3: linalg.pack // MM4D: linalg.generic // MM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MM4D-COUNT-1: tensor.unpack +// MM4D-COUNT-1: linalg.unpack // MTM4D-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d5, d3)> // MTM4D-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d5, d4)> // MTM4D-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)> // MTM4D-LABEL: func @block_matmul -// MTM4D-COUNT-3: tensor.pack +// MTM4D-COUNT-3: linalg.pack // MTM4D: linalg.generic // MTM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MTM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MTM4D-COUNT-1: tensor.unpack +// MTM4D-COUNT-1: linalg.unpack // MTM4D-LABEL: func @block_matmul_transpose_a -// MTM4D-COUNT-3: tensor.pack +// MTM4D-COUNT-3: linalg.pack // MTM4D: linalg.generic // MTM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MTM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MTM4D-COUNT-1: tensor.unpack +// MTM4D-COUNT-1: linalg.unpack // MTM4D-LABEL: func @block_matmul_transpose_b -// MTM4D-COUNT-3: tensor.pack +// MTM4D-COUNT-3: linalg.pack // MTM4D: linalg.generic // MTM4D-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // MTM4D-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] -// MTM4D-COUNT-1: tensor.unpack +// MTM4D-COUNT-1: linalg.unpack diff --git a/mlir/test/Dialect/Linalg/block-pack-matmul-padding.mlir b/mlir/test/Dialect/Linalg/block-pack-matmul-padding.mlir index 9e396ba08d246..e667879ceea0e 100644 --- a/mlir/test/Dialect/Linalg/block-pack-matmul-padding.mlir +++ b/mlir/test/Dialect/Linalg/block-pack-matmul-padding.mlir @@ -21,17 +21,17 @@ func.func @block_matmul_padding( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<123x125xf32>, %[[B:[0-9a-z]+]]: tensor<125x124xf32>, %[[C:[0-9a-z]+]]: tensor<123x124xf32> // CHECK-DAG: %[[ZERO:.+]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<4x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: padding_value(%[[ZERO]] : 
f32) // CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<123x125xf32> -> tensor<4x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<8x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: padding_value(%[[ZERO]] : f32) // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<125x124xf32> -> tensor<8x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<4x8x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: padding_value(%[[ZERO]] : f32) // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<123x124xf32> -> tensor<4x8x32x16xf32> @@ -39,17 +39,17 @@ func.func @block_matmul_padding( // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<4x2x32x64xf32>, tensor<8x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<4x8x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<4x8x32x16xf32> -> tensor<123x124xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<123x124xf32> // NOPAD-LABEL: func @block_matmul_padding( // NOPAD-SAME: %[[A:[0-9a-z]+]]: tensor<123x125xf32>, %[[B:[0-9a-z]+]]: tensor<125x124xf32>, %[[C:[0-9a-z]+]]: tensor<123x124xf32> -// NOPAD-NOT: tensor.pack +// NOPAD-NOT: linalg.pack // NOPAD: linalg.matmul ins(%[[A]], %[[B]] : tensor<123x125xf32>, tensor<125x124xf32>) // NOPAD-SAME: outs(%[[C]] : tensor<123x124xf32>) -> tensor<123x124xf32> -// NOPAD-NOT: tensor.unpack +// NOPAD-NOT: linalg.unpack // PAD-MULT-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)> // PAD-MULT-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)> @@ -58,17 +58,17 @@ func.func @block_matmul_padding( // PAD-MULT-SAME: %[[A:[0-9a-z]+]]: tensor<123x125xf32>, %[[B:[0-9a-z]+]]: tensor<125x124xf32>, %[[C:[0-9a-z]+]]: tensor<123x124xf32> // PAD-MULT-DAG: %[[ZERO:.+]] = arith.constant 0.000000e+00 : f32 // PAD-MULT: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<1x1x256x384xf32> -// PAD-MULT: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// PAD-MULT: %[[A_PACKED:.+]] = linalg.pack %[[A]] // PAD-MULT-SAME: padding_value(%[[ZERO]] : f32) // PAD-MULT-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [256, 384] // PAD-MULT-SAME: into %[[PACK_DST_0]] : tensor<123x125xf32> -> tensor<1x1x256x384xf32> // PAD-MULT: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<1x1x512x384xf32> -// PAD-MULT: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// PAD-MULT: %[[B_PACKED:.+]] = linalg.pack %[[B]] // PAD-MULT-SAME: padding_value(%[[ZERO]] : f32) // PAD-MULT-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [512, 384] // PAD-MULT-SAME: into %[[PACK_DST_1]] : tensor<125x124xf32> -> tensor<1x1x512x384xf32> // PAD-MULT: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<1x1x256x512xf32> -// PAD-MULT: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// PAD-MULT: %[[C_PACKED:.+]] = linalg.pack %[[C]] // PAD-MULT-SAME: padding_value(%[[ZERO]] : f32) // 
PAD-MULT-SAME: inner_dims_pos = [0, 1] inner_tiles = [256, 512] // PAD-MULT-SAME: into %[[PACK_DST_2]] : tensor<123x124xf32> -> tensor<1x1x256x512xf32> @@ -76,7 +76,7 @@ func.func @block_matmul_padding( // PAD-MULT-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // PAD-MULT-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // PAD-MULT-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<1x1x256x384xf32>, tensor<1x1x512x384xf32>) outs(%[[C_PACKED]] : tensor<1x1x256x512xf32>) -// PAD-MULT: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// PAD-MULT: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // PAD-MULT-SAME: inner_dims_pos = [0, 1] inner_tiles = [256, 512] // PAD-MULT-SAME: into %[[C]] : tensor<1x1x256x512xf32> -> tensor<123x124xf32> // PAD-MULT: return %[[RES_UNPACKED]] : tensor<123x124xf32> diff --git a/mlir/test/Dialect/Linalg/block-pack-matmul.mlir b/mlir/test/Dialect/Linalg/block-pack-matmul.mlir index 8a82608177692..aa860dbd581a9 100644 --- a/mlir/test/Dialect/Linalg/block-pack-matmul.mlir +++ b/mlir/test/Dialect/Linalg/block-pack-matmul.mlir @@ -14,22 +14,22 @@ func.func @block_matmul( // CHECK-LABEL: func @block_matmul( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<128x128xf32>, %[[B:[0-9a-z]+]]: tensor<128x128xf32>, %[[C:[0-9a-z]+]]: tensor<128x128xf32> // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<4x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<128x128xf32> -> tensor<4x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<8x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<128x128xf32> -> tensor<8x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<4x8x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<128x128xf32> -> tensor<4x8x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<4x2x32x64xf32>, tensor<8x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<4x8x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<4x8x32x16xf32> -> tensor<128x128xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<128x128xf32> @@ -60,7 +60,7 @@ func.func @block_matmul_dynamic( // CHECK-DAG: %[[A_OUTER_TILE_M:.+]] = affine.apply #[[$MAP_M]]()[%[[A_M]]] // CHECK-DAG: %[[A_OUTER_TILE_K:.+]] = affine.apply #[[$MAP_K]]()[%[[A_K]]] // CHECK: %[[PACK_DST_0:.+]] = tensor.empty(%[[A_OUTER_TILE_M]], %[[A_OUTER_TILE_K]]) : tensor -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: padding_value(%[[ZERO]] : f32) // CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] 
inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor -> tensor @@ -69,7 +69,7 @@ func.func @block_matmul_dynamic( // CHECK-DAG: %[[B_OUTER_TILE_K:.+]] = affine.apply #[[$MAP_K]]()[%[[B_K]]] // CHECK-DAG: %[[B_OUTER_TILE_N:.+]] = affine.apply #[[$MAP_N]]()[%[[B_N]]] // CHECK: %[[PACK_DST_1:.+]] = tensor.empty(%[[B_OUTER_TILE_N]], %[[B_OUTER_TILE_K]]) : tensor -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: padding_value(%[[ZERO]] : f32) // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor -> tensor @@ -78,7 +78,7 @@ func.func @block_matmul_dynamic( // CHECK-DAG: %[[C_OUTER_TILE_M:.+]] = affine.apply #[[$MAP_M]]()[%[[C_M]]] // CHECK-DAG: %[[C_OUTER_TILE_N:.+]] = affine.apply #[[$MAP_N]]()[%[[C_N]]] // CHECK: %[[PACK_DST_2:.+]] = tensor.empty(%[[C_OUTER_TILE_M]], %[[C_OUTER_TILE_N]]) : tensor -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: padding_value(%[[ZERO]] : f32) // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor -> tensor @@ -86,7 +86,7 @@ func.func @block_matmul_dynamic( // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor, tensor) outs(%[[C_PACKED]] : tensor) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor -> tensor // CHECK: return %[[RES_UNPACKED]] : tensor @@ -107,7 +107,7 @@ func.func @block_matmul_with_constant( // CHECK-DAG: %[[RES_DST:.+]] = arith.constant dense<0.000000e+00> : tensor<128x128xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: ins({{.*}} : tensor<4x2x32x64xf32>, tensor<8x2x16x64xf32>) outs(%[[CST_ACC_PACKED]] : tensor<4x8x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[RES_DST]] : tensor<4x8x32x16xf32> -> tensor<128x128xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<128x128xf32> @@ -130,7 +130,7 @@ func.func @block_matmul_with_producer( // CHECK: %[[ACC_PACKED:.+]] = linalg.fill ins(%[[C0]] : f32) outs(%[[FILL_DST_PACKED]] : tensor<4x8x32x16xf32>) -> tensor<4x8x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: ins({{.*}} : tensor<4x2x32x64xf32>, tensor<8x2x16x64xf32>) outs(%[[ACC_PACKED]] : tensor<4x8x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<4x8x32x16xf32> -> tensor<128x128xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<128x128xf32> @@ -152,7 +152,7 @@ func.func @block_matmul_with_consumer( // CHECK-DAG: %[[RES_DST:.+]] = tensor.empty() : tensor<128x128xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: outs({{.*}} : tensor<4x8x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack 
%[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<4x8x32x16xf32> -> tensor<128x128xf32> // CHECK: %[[ADD_RES:.+]] = linalg.add @@ -175,22 +175,22 @@ func.func @block_batch_matmul( // CHECK-LABEL: func @block_batch_matmul( // CHECK-SAME: %[[A:.+]]: tensor<512x64x128xf32>, %[[B:.+]]: tensor<512x128x64xf32>, %[[C:.+]]: tensor<512x64x64xf32> // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<512x2x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<512x64x128xf32> -> tensor<512x2x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<512x4x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<512x128x64xf32> -> tensor<512x4x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<512x2x4x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<512x64x64xf32> -> tensor<512x2x4x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<512x2x2x32x64xf32>, tensor<512x4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<512x2x4x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<512x2x4x32x16xf32> -> tensor<512x64x64xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<512x64x64xf32> @@ -211,22 +211,22 @@ func.func @block_matmul_transpose_a( // CHECK-LABEL: func @block_matmul_transpose_a( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<128x64xf32>, %[[B:[0-9a-z]+]]: tensor<128x64xf32>, %[[C:[0-9a-z]+]]: tensor<64x64xf32> // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<2x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<128x64xf32> -> tensor<2x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<4x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<128x64xf32> -> tensor<4x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<2x4x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<64x64xf32> -> tensor<2x4x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", 
"parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<2x2x32x64xf32>, tensor<4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<2x4x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<2x4x32x16xf32> -> tensor<64x64xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<64x64xf32> @@ -247,22 +247,22 @@ func.func @block_batch_matmul_transpose_a( // CHECK-LABEL: func @block_batch_matmul_transpose_a( // CHECK-SAME: %[[A:.+]]: tensor<512x128x64xf32>, %[[B:.+]]: tensor<512x128x64xf32>, %[[C:.+]]: tensor<512x64x64xf32> // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<512x2x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<512x128x64xf32> -> tensor<512x2x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<512x4x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<512x128x64xf32> -> tensor<512x4x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<512x2x4x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<512x64x64xf32> -> tensor<512x2x4x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<512x2x2x32x64xf32>, tensor<512x4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<512x2x4x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<512x2x4x32x16xf32> -> tensor<512x64x64xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<512x64x64xf32> @@ -283,22 +283,22 @@ func.func @block_matmul_transpose_b( // CHECK-LABEL: func @block_matmul_transpose_b( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<64x128xf32>, %[[B:[0-9a-z]+]]: tensor<64x128xf32>, %[[C:[0-9a-z]+]]: tensor<64x64xf32> // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<2x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<64x128xf32> -> tensor<2x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<4x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<64x128xf32> -> tensor<4x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<2x4x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// 
CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<64x64xf32> -> tensor<2x4x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<2x2x32x64xf32>, tensor<4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<2x4x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<2x4x32x16xf32> -> tensor<64x64xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<64x64xf32> @@ -319,22 +319,22 @@ func.func @block_batch_matmul_transpose_b( // CHECK-LABEL: func @block_batch_matmul_transpose_b( // CHECK-SAME: %[[A:.+]]: tensor<512x64x128xf32>, %[[B:.+]]: tensor<512x64x128xf32>, %[[C:.+]]: tensor<512x64x64xf32> // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<512x2x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<512x64x128xf32> -> tensor<512x2x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<512x4x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<512x64x128xf32> -> tensor<512x4x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<512x2x4x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<512x64x64xf32> -> tensor<512x2x4x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<512x2x2x32x64xf32>, tensor<512x4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<512x2x4x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [1, 2] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<512x2x4x32x16xf32> -> tensor<512x64x64xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<512x64x64xf32> @@ -365,22 +365,22 @@ func.func @block_generic_matmul( // CHECK-LABEL: func @block_generic_matmul( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<128x128xf32>, %[[B:[0-9a-z]+]]: tensor<128x128xf32>, %[[C:[0-9a-z]+]]: tensor<128x128xf32> // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<4x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<128x128xf32> -> tensor<4x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<8x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack 
%[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<128x128xf32> -> tensor<8x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<4x8x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<128x128xf32> -> tensor<4x8x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<4x2x32x64xf32>, tensor<8x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<4x8x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<4x8x32x16xf32> -> tensor<128x128xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<128x128xf32> @@ -411,22 +411,22 @@ func.func @block_generic_matmul_transpose_a( // CHECK-LABEL: func @block_generic_matmul_transpose_a( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<128x64xf32>, %[[B:[0-9a-z]+]]: tensor<128x64xf32>, %[[C:[0-9a-z]+]]: tensor<64x64xf32> // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<2x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: %[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<128x64xf32> -> tensor<2x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<4x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<128x64xf32> -> tensor<4x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<2x4x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<64x64xf32> -> tensor<2x4x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<2x2x32x64xf32>, tensor<4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<2x4x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<2x4x32x16xf32> -> tensor<64x64xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<64x64xf32> @@ -457,22 +457,22 @@ func.func @block_generic_matmul_transpose_b( // CHECK-LABEL: func @block_generic_matmul_transpose_b( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<64x128xf32>, %[[B:[0-9a-z]+]]: tensor<64x128xf32>, %[[C:[0-9a-z]+]]: tensor<64x64xf32> // CHECK: %[[PACK_DST_0:.+]] = tensor.empty() : tensor<2x2x32x64xf32> -// CHECK: %[[A_PACKED:.+]] = tensor.pack %[[A]] +// CHECK: 
%[[A_PACKED:.+]] = linalg.pack %[[A]] // CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [32, 64] // CHECK-SAME: into %[[PACK_DST_0]] : tensor<64x128xf32> -> tensor<2x2x32x64xf32> // CHECK: %[[PACK_DST_1:.+]] = tensor.empty() : tensor<4x2x16x64xf32> -// CHECK: %[[B_PACKED:.+]] = tensor.pack %[[B]] +// CHECK: %[[B_PACKED:.+]] = linalg.pack %[[B]] // CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 64] // CHECK-SAME: into %[[PACK_DST_1]] : tensor<64x128xf32> -> tensor<4x2x16x64xf32> // CHECK: %[[PACK_DST_2:.+]] = tensor.empty() : tensor<2x4x32x16xf32> -// CHECK: %[[C_PACKED:.+]] = tensor.pack %[[C]] +// CHECK: %[[C_PACKED:.+]] = linalg.pack %[[C]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[PACK_DST_2]] : tensor<64x64xf32> -> tensor<2x4x32x16xf32> // CHECK: %[[GEMM_RES_PACKED:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"] // CHECK-SAME: ins(%[[A_PACKED]], %[[B_PACKED]] : tensor<2x2x32x64xf32>, tensor<4x2x16x64xf32>) outs(%[[C_PACKED]] : tensor<2x4x32x16xf32>) -// CHECK: %[[RES_UNPACKED:.+]] = tensor.unpack %[[GEMM_RES_PACKED]] +// CHECK: %[[RES_UNPACKED:.+]] = linalg.unpack %[[GEMM_RES_PACKED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[C]] : tensor<2x4x32x16xf32> -> tensor<64x64xf32> // CHECK: return %[[RES_UNPACKED]] : tensor<64x64xf32> @@ -498,10 +498,10 @@ func.func @non_contraction_generic( // CHECK-LABEL: func @non_contraction_generic( // CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<64x128xf32> // CHECK-DAG: %[[C0:.+]] = arith.constant 0.000000e+00 : f32 -// CHECK-NOT: tensor.pack +// CHECK-NOT: linalg.pack // CHECK: %[[GENERIC:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]]] // CHECK-SAME: iterator_types = ["parallel", "parallel"] // CHECK-SAME: outs(%[[A]] : tensor<64x128xf32>) -// CHECK-NOT: tensor.unpack +// CHECK-NOT: linalg.unpack // CHECK: return %[[GENERIC]] : tensor<64x128xf32> diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir index cd439cd23ecd0..db4f6181f517c 100644 --- a/mlir/test/Dialect/Linalg/canonicalize.mlir +++ b/mlir/test/Dialect/Linalg/canonicalize.mlir @@ -357,7 +357,7 @@ func.func @fill_pack() -> tensor<24x32x16x16xf32> { %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<24x32x16x16xf32> %1 = linalg.fill ins(%cst : f32) outs(%dest : tensor<384x512xf32>) -> tensor<384x512xf32> - %pack = tensor.pack %1 inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %0 : tensor<384x512xf32> -> tensor<24x32x16x16xf32> + %pack = linalg.pack %1 inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %0 : tensor<384x512xf32> -> tensor<24x32x16x16xf32> return %pack : tensor<24x32x16x16xf32> } // CHECK-LABEL: func.func @fill_pack @@ -374,7 +374,7 @@ func.func @fill_pack_general() -> tensor<1x1x8x4x4x8xi32>{ %extracted_slice_15 = tensor.extract_slice %9[0, 0, 0, 0] [1, 1, 16, 64] [1, 1, 1, 1] : tensor<1x1x16x64xi32> to tensor<1x1x16x64xi32> %16 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_15 : tensor<1x1x16x64xi32>) -> tensor<1x1x16x64xi32> %0 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x4x8xi32> to tensor<1x1x8x4x4x8xi32> - %pack_18 = tensor.pack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %0 : tensor<1x1x16x64xi32> -> tensor<1x1x8x4x4x8xi32> + %pack_18 
= linalg.pack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %0 : tensor<1x1x16x64xi32> -> tensor<1x1x8x4x4x8xi32> return %pack_18 : tensor<1x1x8x4x4x8xi32> } @@ -397,7 +397,7 @@ func.func @dynamic_fill_pack(%arg0: tensor) -> tensor { %1 = affine.apply #map()[%dim] %2 = affine.apply #map()[%dim_0] %3 = tensor.empty(%1, %2) : tensor - %pack = tensor.pack %0 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %3 : tensor -> tensor + %pack = linalg.pack %0 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %3 : tensor -> tensor return %pack : tensor } // CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -1249,3 +1249,499 @@ func.func @recursive_effect(%arg : tensor<1xf32>) { // CHECK-LABEL: @recursive_effect // CHECK: linalg.map + +//===----------------------------------------------------------------------===// +// linalg.pack +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @fold_pack_constant_splat +// CHECK-NOT: linalg.pack +// CHECK: arith.constant dense<1.000000e-01> : tensor<8x16x8x32xf32> +func.func @fold_pack_constant_splat(%dest : tensor<8x16x8x32xf32>) -> tensor<8x16x8x32xf32> { + %cst = arith.constant dense<1.000000e-01> : tensor<64x128xf32> + %0 = linalg.pack %cst outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] + inner_tiles = [8, 32] into %dest : tensor<64x128xf32> -> tensor<8x16x8x32xf32> + return %0 : tensor<8x16x8x32xf32> +} + +// ----- + +// CHECK-LABEL: func @fold_padding_value_pack_constant_splat +// CHECK-NOT: linalg.pack +// CHECK: arith.constant dense<1.000000e-01> : tensor<8x16x8x32xf32> +func.func @fold_padding_value_pack_constant_splat(%dest : tensor<8x16x8x32xf32>) -> tensor<8x16x8x32xf32> { + %pad = arith.constant 1.000000e-01 : f32 + %cst = arith.constant dense<1.000000e-01> : tensor<63x127xf32> + %0 = linalg.pack %cst + padding_value(%pad : f32) + outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] + inner_tiles = [8, 32] into %dest : tensor<63x127xf32> -> tensor<8x16x8x32xf32> + return %0 : tensor<8x16x8x32xf32> +} + + +// ----- + +// CHECK-LABEL: func @nofold_padding_value_pack_constant_splat +// CHECK: arith.constant dense<1.000000e-01> : tensor<63x127xf32> +// CHECK: linalg.pack +func.func @nofold_padding_value_pack_constant_splat(%dest : tensor<8x16x8x32xf32>) -> tensor<8x16x8x32xf32> { + %pad = arith.constant 0.0 : f32 + %cst = arith.constant dense<1.000000e-01> : tensor<63x127xf32> + %0 = linalg.pack %cst + padding_value(%pad : f32) + outer_dims_perm = [1, 0] + inner_dims_pos = [0, 1] + inner_tiles = [8, 32] + into %dest : tensor<63x127xf32> -> tensor<8x16x8x32xf32> + return %0 : tensor<8x16x8x32xf32> +} + +// ----- + +func.func @fold_padding_value_pack(%arg0: tensor<1200x500000xf32>) -> tensor<31250x1200x16x1xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor<31250x1200x16x1xf32> + %pack = linalg.pack %arg0 + padding_value(%cst : f32) + outer_dims_perm = [1, 0] + inner_dims_pos = [1, 0] + inner_tiles = [16, 1] + into %0 : tensor<1200x500000xf32> -> tensor<31250x1200x16x1xf32> + return %pack : tensor<31250x1200x16x1xf32> +} +// CHECK-LABEL: func @fold_padding_value_pack +// CHECK-NOT: padding_value + +// ----- + +func.func @infer_src_shape_pack(%src: tensor, %dest: tensor<10x20x30x40x16xf32>) -> tensor<10x20x30x40x16xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %pack = linalg.pack %src + padding_value(%cst : f32) + outer_dims_perm = [2, 1, 3, 0] + inner_dims_pos 
= [2] + inner_tiles = [16] + into %dest : tensor -> tensor<10x20x30x40x16xf32> + return %pack : tensor<10x20x30x40x16xf32> +} +// CHECK-LABEL: func.func @infer_src_shape_pack +// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] +// CHECK-SAME: %[[DEST:[0-9a-zA-Z]+]] +// CHECK: %[[CAST_SRC:.+]] = tensor.cast %[[SRC]] : tensor to tensor<40x20x?x30xf32> +// CHECK: %[[PACK:.+]] = linalg.pack %[[CAST_SRC]] {{.+}} into %[[DEST]] +// CHECK: return %[[PACK]] + +// ----- + +func.func @infer_dest_shape_pack(%src: tensor<30x20x?x10xf32>, %dest: tensor) -> tensor { + %cst = arith.constant 0.000000e+00 : f32 + %pack = linalg.pack %src + padding_value(%cst : f32) + outer_dims_perm = [2, 1, 3, 0] + inner_dims_pos = [2] + inner_tiles = [16] + into %dest : tensor<30x20x?x10xf32> -> tensor + return %pack : tensor +} +// CHECK-LABEL: func.func @infer_dest_shape_pack +// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] +// CHECK-SAME: %[[DEST:[0-9a-zA-Z]+]] +// CHECK: %[[CAST_DEST:.+]] = tensor.cast %[[DEST]] : tensor to tensor +// CHECK: %[[PACK:.+]] = linalg.pack %[[SRC]] {{.+}} into %[[CAST_DEST]] +// CHECK: %[[CAST_PACK:.+]] = tensor.cast %[[PACK]] : tensor to tensor +// CHECK: return %[[CAST_PACK]] + +// ----- + +func.func @no_infer_pack_shape(%arg0: tensor, %arg1: index) -> tensor<32x7x?x16x1xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty(%arg1) : tensor<32x7x?x16x1xf32> + %pack = linalg.pack %arg0 padding_value(%cst : f32) outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %0 : tensor -> tensor<32x7x?x16x1xf32> + return %pack : tensor<32x7x?x16x1xf32> +} +// CHECK-LABEL: func.func @no_infer_pack_shape +// CHECK-NOT: tensor.cast + +// ----- + +func.func @fold_padding_value_pack_negative1(%arg0: tensor<1200x499999xf32>) -> tensor<31250x1200x16x1xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor<31250x1200x16x1xf32> + %pack = linalg.pack %arg0 + padding_value(%cst : f32) + outer_dims_perm = [1, 0] + inner_dims_pos = [1, 0] + inner_tiles = [16, 1] + into %0 : tensor<1200x499999xf32> -> tensor<31250x1200x16x1xf32> + return %pack : tensor<31250x1200x16x1xf32> +} +// CHECK-LABEL: func @fold_padding_value_pack_negative1 +// CHECK: linalg.pack +// CHECK-SAME: padding_value + +// ----- + +func.func @fold_padding_value_pack_negative2(%arg0: tensor<1200x?xf32>, %arg1: tensor) -> tensor { + %cst = arith.constant 0.000000e+00 : f32 + %pack = linalg.pack %arg0 + padding_value(%cst : f32) + outer_dims_perm = [1, 0] + inner_dims_pos = [1, 0] + inner_tiles = [16, 1] + into %arg1 : tensor<1200x?xf32> -> tensor + return %pack : tensor +} +// CHECK-LABEL: func @fold_padding_value_pack_negative2 +// CHECK: linalg.pack +// CHECK-SAME: padding_value + +// ----- + +func.func @fold_padding_value_pack_negative3(%arg0: tensor<1200x500000xf32>, %arg1: tensor, %tile : index) -> tensor { + %cst = arith.constant 0.000000e+00 : f32 + %pack = linalg.pack %arg0 + padding_value(%cst : f32) + outer_dims_perm = [1, 0] + inner_dims_pos = [1, 0] + inner_tiles = [%tile, 1] + into %arg1 : tensor<1200x500000xf32> -> tensor + return %pack : tensor +} +// CHECK-LABEL: func @fold_padding_value_pack_negative3 +// CHECK: linalg.pack +// CHECK-SAME: padding_value + +// ----- + +//===----------------------------------------------------------------------===// +// linalg.unpack +//===----------------------------------------------------------------------===// + + +// CHECK-LABEL: func @fold_unpack_constant_splat +// CHECK-NOT: linalg.unpack +// CHECK: arith.constant dense<1.000000e-01> : 
tensor<128x256xf32> +func.func @fold_unpack_constant_splat(%dest : tensor<128x256xf32>) -> tensor<128x256xf32> { + %cst = arith.constant dense<1.000000e-01> : tensor<16x8x8x32xf32> + %0 = linalg.unpack %cst inner_dims_pos = [0, 1] + inner_tiles = [8, 32] into %dest : tensor<16x8x8x32xf32> -> tensor<128x256xf32> + return %0 : tensor<128x256xf32> +} + +// ----- + +func.func @infer_dest_shape_unpack(%src: tensor<10x20x30x40x16xf32>, %dest: tensor) -> tensor { + %unpack = linalg.unpack %src + outer_dims_perm = [2, 1, 3, 0] + inner_dims_pos = [2] + inner_tiles = [16] + into %dest : tensor<10x20x30x40x16xf32> -> tensor + return %unpack : tensor +} +// CHECK-LABEL: func.func @infer_dest_shape_unpack +// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] +// CHECK-SAME: %[[DEST:[0-9a-zA-Z]+]] +// CHECK: %[[CAST_DEST:.+]] = tensor.cast %[[DEST]] : tensor to tensor<40x20x?x30xf32> +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[SRC]] {{.+}} into %[[CAST_DEST]] +// CHECK: %[[CAST_UNPACK:.+]] = tensor.cast %[[UNPACK]] : tensor<40x20x?x30xf32> to tensor +// CHECK: return %[[CAST_UNPACK]] + +// ----- + +func.func @infer_src_shape_unpack(%src: tensor, %dest: tensor<30x20x?x10xf32>) -> tensor<30x20x?x10xf32> { + %unpack = linalg.unpack %src + outer_dims_perm = [2, 1, 3, 0] + inner_dims_pos = [2] + inner_tiles = [16] + into %dest : tensor -> tensor<30x20x?x10xf32> + return %unpack : tensor<30x20x?x10xf32> +} +// CHECK-LABEL: func.func @infer_src_shape_unpack +// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] +// CHECK-SAME: %[[DEST:[0-9a-zA-Z]+]] +// CHECK: %[[CAST_SRC:.+]] = tensor.cast %[[SRC]] : tensor to tensor +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[CAST_SRC]] +// CHECK: return %[[UNPACK]] + +// ----- + +func.func @no_infer_unpack_shape(%arg1: tensor<32x7x?x16x1xf32>, %arg2: index) -> tensor { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty(%arg2) : tensor + %unpack = linalg.unpack %arg1 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %0 : tensor<32x7x?x16x1xf32> -> tensor + return %unpack : tensor +} +// CHECK-LABEL: func.func @no_infer_unpack_shape +// CHECK-NOT: tensor.cast + +// ----- + +//===----------------------------------------------------------------------===// +// linalg.pack + linalg.unpack +//===----------------------------------------------------------------------===// + +// Chain: NC -> NCnc -> NCnc -> NC +// CHECK: func.func @unpack_pack( +// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>) +// CHECK: return %[[T]] : tensor<128x128xf32> +func.func @unpack_pack(%t: tensor<128x128xf32>) -> tensor<128x128xf32> { + %tensor_empty = tensor.empty() : tensor<16x16x8x8xf32> + %packed = linalg.pack %t inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty : tensor<128x128xf32> -> tensor<16x16x8x8xf32> + %tensor_empty1 = tensor.empty() : tensor<128x128xf32> + %unpacked = linalg.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty1 : tensor<16x16x8x8xf32> -> tensor<128x128xf32> + return %unpacked : tensor<128x128xf32> +} + +// ----- + +// Chain: NC -> NCcn -> NCnc -> NC +// CHECK: func.func @unpack_pack( +// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>) +// CHECK-NOT: return %[[T]] : tensor<128x128xf32> +func.func @unpack_pack(%t: tensor<128x128xf32>) -> tensor<128x128xf32> { + %tensor_empty = tensor.empty() : tensor<16x16x8x8xf32> + %packed = linalg.pack %t inner_dims_pos = [1, 0] inner_tiles = [8, 8] into %tensor_empty : tensor<128x128xf32> -> tensor<16x16x8x8xf32> + %tensor_empty1 = tensor.empty() : tensor<128x128xf32> + %unpacked = linalg.unpack 
%packed inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty1 : tensor<16x16x8x8xf32> -> tensor +<128x128xf32> + return %unpacked : tensor<128x128xf32> +} + +// ----- + +// Chain: NC -> CNcn -> NCnc -> NC +// CHECK: func.func @unpack_pack( +// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>) +// CHECK-NOT: return %[[T]] : tensor<128x128xf32> +func.func @unpack_pack(%t: tensor<128x128xf32>) -> tensor<128x128xf32> { + %tensor_empty = tensor.empty() : tensor<16x16x8x8xf32> + %packed = linalg.pack %t outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [8, 8] into %tensor_empty : tensor<128x128xf32> -> tensor<16x16x8x8xf32> + %tensor_empty1 = tensor.empty() : tensor<128x128xf32> + %unpacked = linalg.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty1 : tensor<16x16x8x8xf32> -> tensor +<128x128xf32> + return %unpacked : tensor<128x128xf32> +} + +// ----- + +// Chain: NC -> NCnc -> NCnc -> NC +// CHECK: func.func @unpack_pack( +// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>, +// CHECK: return %[[T]] : tensor<128x128xf32> +func.func @unpack_pack(%t: tensor<128x128xf32>, %tile1: index, %tile2: index) -> tensor<128x128xf32> { + %tensor_empty = tensor.empty(%tile1, %tile2) : tensor<16x16x?x?xf32> + %packed = linalg.pack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor<128x128xf32> -> tensor<16x16x?x?xf32> + %tensor_empty1 = tensor.empty() : tensor<128x128xf32> + %unpacked = linalg.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor<16x16x?x?xf32> -> tensor +<128x128xf32> + return %unpacked : tensor<128x128xf32> +} + +// ----- + +// CHECK: func.func @unpack_pack_with_padding_no_canonicalization( +// CHECK: linalg.pack +// CHECK: linalg.unpack +func.func @unpack_pack_with_padding_no_canonicalization(%t: tensor<256x512xbf16>) -> tensor<224x512xbf16> { + %tensor_empty = tensor.empty() : tensor<4x16x64x32xbf16> + %tensor_empty1 = tensor.empty() : tensor<224x512xbf16> + %packed = linalg.pack %t outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %tensor_empty : tensor<256x512xbf16> -> tensor<4x16x64x32xbf16> + %unpacked = linalg.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %tensor_empty1 : tensor<4x16x64x32xbf16> -> tensor<224x512xbf16> + return %unpacked : tensor<224x512xbf16> +} + +// ----- + +// Chain NCnc -> NC -> NC -> NCnc +// CHECK: func.func @pack_unpack( +// CHECK-SAME: %[[T:.+]]: tensor<16x16x?x?xf32>, +// CHECK: return %[[T]] : tensor<16x16x?x?xf32> +func.func @pack_unpack(%t: tensor<16x16x?x?xf32>, %tile1: index, %tile2: index) -> tensor<16x16x?x?xf32> { + %tensor_empty = tensor.empty() : tensor<128x128xf32> + %unpacked = linalg.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor<16x16x?x?xf32> -> tensor<128x128xf32> + %tensor_empty1 = tensor.empty(%tile1, %tile2) : tensor<16x16x?x?xf32> + %packed = linalg.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor<128x128xf32> -> tensor<16x16x?x?xf32> + return %packed : tensor<16x16x?x?xf32> +} + +// ----- + +// Chain NCnc -> NC -> NC -> NCnc +// CHECK: func.func @pack_unpack( +// CHECK-SAME: %[[T:.+]]: tensor<16x16x8x8xf32> +// CHECK: return %[[T]] : tensor<16x16x8x8xf32> +func.func @pack_unpack(%t: tensor<16x16x8x8xf32>) -> tensor<16x16x8x8xf32> { + %tensor_empty = tensor.empty() : tensor<128x128xf32> + %unpacked = linalg.unpack %t inner_dims_pos = [0, 1] inner_tiles = [8, 8] into 
%tensor_empty : tensor<16x16x8x8xf32> -> tensor<128x128xf32> + %tensor_empty1 = tensor.empty() : tensor<16x16x8x8xf32> + %packed = linalg.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty1 : tensor<128x128xf32> -> tensor<16x16x8x8xf32> + return %packed : tensor<16x16x8x8xf32> +} + +// ----- + +// CHECK: func.func @pack_unpack_same_tiles( +// CHECK-SAME: %[[T:.+]]: tensor, +// CHECK: return %[[T]] : tensor +func.func @pack_unpack_same_tiles(%t: tensor, %dim1: index, %dim2: index, %dim3: index, %dim4: index, %dim5: index, %dim6: index, + %tile1: index, %tile2: index) -> tensor { + %tensor_empty = tensor.empty(%dim1, %dim2) : tensor + %unpacked = linalg.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor -> tensor + %tensor_empty1 = tensor.empty(%dim3, %dim4, %dim5, %dim6) : tensor + %packed = linalg.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor -> tensor + return %packed : tensor +} + +// ----- + +// CHECK: func.func @pack_unpack_different_tiles( +// CHECK-SAME: %[[T:.+]]: tensor, +// CHECK-NOT: return %[[T]] : tensor +func.func @pack_unpack_different_tiles(%t: tensor, %dim1: index, %dim2: index, %dim3: index, %dim4: index, %dim5: index, %dim6: index, + %tile1: index, %tile2: index) -> tensor { + %tensor_empty = tensor.empty(%dim1, %dim2) : tensor + %unpacked = linalg.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor -> tensor + %tensor_empty1 = tensor.empty(%dim3, %dim4, %dim5, %dim6) : tensor + %packed = linalg.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [%tile2, %tile1] into %tensor_empty1 : tensor -> tensor + return %packed : tensor +} + +// ----- + +// CHECK: func.func @pack_unpack_dynamic_with_padding( +// CHECK-SAME: %[[T:.+]]: tensor, +// CHECK-NOT: return %[[T]] : tensor +func.func @pack_unpack_dynamic_with_padding(%t: tensor, %dim1: index, %dim2: index, %dim3: index, %dim4: index, %dim5: index, %dim6: index, + %tile1: index, %tile2: index, %pad: f32) -> tensor { + %tensor_empty = tensor.empty(%dim1, %dim2) : tensor + %unpacked = linalg.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor -> tensor + %tensor_empty1 = tensor.empty(%dim3, %dim4, %dim5, %dim6) : tensor + %packed = linalg.pack %unpacked padding_value(%pad: f32) inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor -> tensor + return %packed : tensor +} + +// ----- + +// CHECK: func.func @pack_outer_dims_unpack_no_outer_dims( +// CHECK-SAME: %[[T:.+]]: tensor<16x16x?x?xf32>, +// CHECK: return %[[T]] : tensor<16x16x?x?xf32> +func.func @pack_outer_dims_unpack_no_outer_dims(%t: tensor<16x16x?x?xf32>, %tile1: index, %tile2: index) -> tensor<16x16x?x?xf32> { + %tensor_empty = tensor.empty() : tensor<128x128xf32> + %unpacked = linalg.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor<16x16x?x?xf32> -> tensor<128x128xf32> + %tensor_empty1 = tensor.empty(%tile1, %tile2) : tensor<16x16x?x?xf32> + %packed = linalg.pack %unpacked outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor<128x128xf32> -> tensor<16x16x?x?xf32> + return %packed : tensor<16x16x?x?xf32> +} + +// ----- + +// CHECK: func.func @pack_no_outer_dims_unpack_outer_dims( +// CHECK-SAME: %[[T:.+]]: tensor<16x16x?x?xf32>, +// CHECK: return %[[T]] : tensor<16x16x?x?xf32> +func.func @pack_no_outer_dims_unpack_outer_dims(%t: 
tensor<16x16x?x?xf32>, %tile1: index, %tile2: index) -> tensor<16x16x?x?xf32> { + %tensor_empty = tensor.empty() : tensor<128x128xf32> + %unpacked = linalg.unpack %t outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor<16x16x?x?xf32> -> tensor<128x128xf32> + %tensor_empty1 = tensor.empty(%tile1, %tile2) : tensor<16x16x?x?xf32> + %packed = linalg.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor<128x128xf32> -> tensor<16x16x?x?xf32> + return %packed : tensor<16x16x?x?xf32> +} + +// ----- + +//===----------------------------------------------------------------------===// +// tensor.cast + linalg.pack +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func.func @fold_cast_pack_dynamic_tile_size +// CHECK-SAME: %[[DEST:.*]]: tensor<1x1x8x1xi32>, +// CHECK-SAME: %[[SRC:.*]]: tensor<7x?xi32>, +// CHECK-SAME: %[[PAD:.*]]: i32) -> tensor<1x1x8x1xi32> { +// CHECK: %[[PACK:.*]] = linalg.pack %[[SRC]] padding_value(%[[PAD]] : i32) +// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %[[DEST]] +// CHECK-SAME: test_attr +// CHECK-SAME: : tensor<7x?xi32> -> tensor<1x1x8x1xi32> +// CHECK: return %[[PACK]] : tensor<1x1x8x1xi32> +func.func @fold_cast_pack_dynamic_tile_size( + %dest: tensor<1x1x8x1xi32>, + %src: tensor<7x?xi32>, + %pad: i32) -> tensor<1x1x8x1xi32> { + + %cast = tensor.cast %dest : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32> + %c8 = arith.constant 8 : index + %pack = linalg.pack %src padding_value(%pad : i32) + inner_dims_pos = [0, 1] + inner_tiles = [%c8, 1] + into %cast {test_attr} : tensor<7x?xi32> -> tensor<1x1x?x1xi32> + %res = tensor.cast %pack : tensor<1x1x?x1xi32> to tensor<1x1x8x1xi32> + return %res : tensor<1x1x8x1xi32> +} + +// ----- + +func.func @infer_and_fold_pack_unpack_same_tiles(%t: tensor<10x20x4x4xf32>) -> tensor<10x20x4x4xf32> { + %dim1 = arith.constant 40 : index + %dim2 = arith.constant 80 : index + %tensor_empty = tensor.empty(%dim1, %dim2) : tensor + %unpacked = linalg.unpack %t inner_dims_pos = [0, 1] inner_tiles = [4, 4] into %tensor_empty : tensor<10x20x4x4xf32> -> tensor + %cast = tensor.cast %unpacked : tensor to tensor<40x80xf32> + %tensor_empty1 = tensor.empty() : tensor<10x20x4x4xf32> + %packed = linalg.pack %cast inner_dims_pos = [0, 1] inner_tiles = [4, 4] into %tensor_empty1 : tensor<40x80xf32> -> tensor<10x20x4x4xf32> + return %packed : tensor<10x20x4x4xf32> +} +// CHECK-LABEL: func.func @infer_and_fold_pack_unpack_same_tiles +// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] +// CHECK: return %[[SRC]] + +// ----- + +// CHECK-LABEL: func.func @pack_dont_drop_attributes( +// CHECK: linalg.pack {{.*}} {test_attr} +func.func @pack_dont_drop_attributes(%arg0: tensor, %arg1: tensor<128x?x100x16x1xf16>) -> tensor<128x?x100x16x1xf16> { + %c32_i64 = arith.constant 32 : i64 + %cst = arith.constant 0.000000e+00 : f16 + %pack = linalg.pack %arg0 padding_value(%cst : f16) outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %arg1 {test_attr} : tensor -> tensor<128x?x100x16x1xf16> + return %pack : tensor<128x?x100x16x1xf16> +} +// ----- + +//===----------------------------------------------------------------------===// +// linalg.fill + linalg.unpack +//===----------------------------------------------------------------------===// +// Fold DstStyleOp -> tensor.unpack operations. 
+func.func @fold_dst_style_ops_into_unpack(%arg0 : tensor, %init : tensor) -> tensor { + %cst = arith.constant 0.0 : f32 + %fill = linalg.fill ins(%cst : f32) outs(%init : tensor) -> tensor + %unpack = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [16, 64] into %fill : tensor -> tensor + return %unpack : tensor +} +// CHECK-LABEL: func @fold_dst_style_ops_into_unpack +// CHECK-SAME: %[[ARG0:.+]]: tensor +// CHECK-SAME: %[[INIT:.+]]: tensor +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] +// CHECK-SAME: into %[[INIT]] +// CHECK: return %[[UNPACK]] + +// ----- + +//===----------------------------------------------------------------------===// +// tensor.cast + linalg.unpack +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func.func @fold_cast_unpack_dynamic_tile_size( +// CHECK-SAME: %[[SRC:.*]]: tensor<1x1x8x1xi32>, +// CHECK-SAME: %[[DEST:.*]]: tensor<7x?xi32>) -> tensor<7x?xi32> { +// CHECK: %[[RES:.*]] = linalg.unpack %[[SRC]] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %[[DEST]] {test_attr} : tensor<1x1x8x1xi32> -> tensor<7x?xi32> +// CHECK: return %[[RES]] : tensor<7x?xi32> +func.func @fold_cast_unpack_dynamic_tile_size( + %src: tensor<1x1x8x1xi32>, + %res: tensor<7x?xi32>) -> tensor<7x?xi32> { + + %cast = tensor.cast %src : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32> + %c8 = arith.constant 8 : index + %unpack = linalg.unpack %cast + inner_dims_pos = [0, 1] + inner_tiles = [%c8, 1] + into %res {test_attr} : tensor<1x1x?x1xi32> -> tensor<7x?xi32> + return %unpack : tensor<7x?xi32> +} diff --git a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir index b2b29b2b2fee2..19d4524a2ec06 100644 --- a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir +++ b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir @@ -15,7 +15,7 @@ func.func @dynamic_elem_pack(%arg0: tensor, %dest: tensor) %4 = arith.addf %arg3, %arg3 : f32 linalg.yield %4 : f32 } -> tensor - %4 = tensor.pack %3 + %4 = linalg.pack %3 inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest : tensor -> tensor @@ -34,7 +34,7 @@ func.func @dynamic_elem_pack(%arg0: tensor, %dest: tensor) // CHECK-DAG: %[[OUTER_D0:.+]] = affine.apply #[[$MAP0]]()[%[[D0]]] // CHECK-DAG: %[[OUTER_D1:.+]] = affine.apply #[[$MAP1]]()[%[[D1]]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty(%[[OUTER_D0]], %[[OUTER_D1]]) : tensor -// CHECK: %[[PACK_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [8, 2] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[ELEM:.+]] = linalg.generic @@ -62,7 +62,7 @@ func.func @dynamic_elem_pack_padding_value(%arg0: tensor, %dest: tensor %4 = arith.addf %arg3, %arg3 : f32 linalg.yield %4 : f32 } -> tensor - %4 = tensor.pack %3 padding_value(%cst : f32) + %4 = linalg.pack %3 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest : tensor -> tensor @@ -70,7 +70,7 @@ func.func @dynamic_elem_pack_padding_value(%arg0: tensor, %dest: tensor } // CHECK-LABEL: func.func @dynamic_elem_pack_padding_value // CHECK: %[[GENERIC:.+]] = linalg.generic -// CHECK: tensor.pack %[[GENERIC]] +// CHECK: linalg.pack %[[GENERIC]] // ----- @@ -84,7 +84,7 @@ func.func @elem_pack_transpose_inner_dims(%arg0: tensor<128x256xi32>, %dest: ten %4 = arith.addi %arg3, %arg3 : i32 linalg.yield %4 : i32 } -> tensor<128x256xi32> - %pack = tensor.pack %elem + %pack = linalg.pack %elem inner_dims_pos = [1, 0] inner_tiles = 
[16, 32] into %dest : tensor<128x256xi32> -> tensor<4x16x16x32xi32> @@ -95,7 +95,7 @@ func.func @elem_pack_transpose_inner_dims(%arg0: tensor<128x256xi32>, %dest: ten // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<4x16x16x32xi32> -// CHECK: %[[PACK_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [1, 0] inner_tiles = [16, 32] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[ELEM:.+]] = linalg.generic @@ -117,7 +117,7 @@ func.func @elem_pack_transpose_outer_dims(%arg0: tensor<128x256xi32>, %dest: ten %4 = arith.addi %arg3, %arg3 : i32 linalg.yield %4 : i32 } -> tensor<128x256xi32> - %pack = tensor.pack %elem + %pack = linalg.pack %elem outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] @@ -129,7 +129,7 @@ func.func @elem_pack_transpose_outer_dims(%arg0: tensor<128x256xi32>, %dest: ten // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<16x4x32x16xi32> -// CHECK: %[[PACK_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[ARG0_EMPTY]] : tensor<128x256xi32> -> tensor<16x4x32x16xi32> // CHECK: %[[ELEM:.+]] = linalg.generic @@ -151,7 +151,7 @@ func.func @elem_pack_transpose_inner_and_outer_dims(%arg0: tensor<128x256xi32>, %4 = arith.addi %arg3, %arg3 : i32 linalg.yield %4 : i32 } -> tensor<128x256xi32> - %pack = tensor.pack %elem + %pack = linalg.pack %elem outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 32] @@ -163,7 +163,7 @@ func.func @elem_pack_transpose_inner_and_outer_dims(%arg0: tensor<128x256xi32>, // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<16x4x16x32xi32> -// CHECK: %[[PACK_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 32] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[ELEM:.+]] = linalg.generic @@ -191,7 +191,7 @@ func.func @dynamic_broadcast_pack(%arg0: tensor, %arg1: tensor, %d %4 = arith.addf %arg3, %arg4 : f32 linalg.yield %4 : f32 } -> tensor - %4 = tensor.pack %3 + %4 = linalg.pack %3 inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest : tensor -> tensor @@ -210,13 +210,13 @@ func.func @dynamic_broadcast_pack(%arg0: tensor, %arg1: tensor, %d // CHECK-DAG: %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C0]] // CHECK-DAG: %[[OUTER_D0:.+]] = affine.apply #[[$MAP0]]()[%[[D0]]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty(%[[OUTER_D0]]) : tensor -// CHECK: %[[PACK_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [8] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK-DAG: %[[D1:.+]] = tensor.dim %[[ARG1]], %[[C0]] // CHECK-DAG: %[[OUTER_D1:.+]] = affine.apply #[[$MAP1]]()[%[[D1]]] // CHECK: %[[ARG1_EMPTY:.+]] = tensor.empty(%[[OUTER_D1]]) : tensor -// CHECK: %[[PACK_ARG1:.+]] = tensor.pack %[[ARG1]] +// CHECK: %[[PACK_ARG1:.+]] = linalg.pack %[[ARG1]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [2] // CHECK-SAME: into %[[ARG1_EMPTY]] // CHECK: %[[ELEM:.+]] = linalg.generic @@ -240,7 +240,7 @@ func.func @elem_pack_transpose_inner_and_outer_dims2(%arg0: tensor<64xf32>, %des ^bb0(%in: 
f32, %out: f32): linalg.yield %in : f32 } -> tensor<1x56x57x64xf32> - %2 = tensor.pack %1 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %dest : tensor<1x56x57x64xf32> -> tensor<1x2x56x57x32xf32> + %2 = linalg.pack %1 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %dest : tensor<1x56x57x64xf32> -> tensor<1x2x56x57x32xf32> return %2 : tensor<1x2x56x57x32xf32> } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d1, d4)> @@ -249,7 +249,7 @@ func.func @elem_pack_transpose_inner_and_outer_dims2(%arg0: tensor<64xf32>, %des // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<2x32xf32> -// CHECK: %[[PACKED_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACKED_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [32] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic @@ -275,7 +275,7 @@ func.func @transpose_pack(%arg0: tensor<100x128x200x256xi32>, %arg1: tensor<100x %1 = arith.addi %0, %b2 : i32 linalg.yield %1 : i32 } -> tensor<100x200x128x256xi32> - %4 = tensor.pack %transpose + %4 = linalg.pack %transpose inner_dims_pos = [3, 2] inner_tiles = [16, 32] into %dest : tensor<100x200x128x256xi32> -> tensor<100x200x4x16x16x32xi32> @@ -291,11 +291,11 @@ func.func @transpose_pack(%arg0: tensor<100x128x200x256xi32>, %arg1: tensor<100x // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]] // CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<100x4x200x16x16x32xi32> -// CHECK: %[[PACKED_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACKED_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [3, 1] inner_tiles = [16, 32] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[ARG2_EMPTY:.+]] = tensor.empty() : tensor<4x32xi32> -// CHECK: %[[PACKED_ARG2:.+]] = tensor.pack %[[ARG2]] +// CHECK: %[[PACKED_ARG2:.+]] = linalg.pack %[[ARG2]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [32] // CHECK-SAME: into %[[ARG2_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic @@ -321,7 +321,7 @@ func.func @affine_constant_expr_pack(%arg0: tensor<100x128x200x256xi32>, %arg1: %1 = arith.addi %0, %b2 : i32 linalg.yield %1 : i32 } -> tensor<100x200x128x256xi32> - %4 = tensor.pack %transpose + %4 = linalg.pack %transpose inner_dims_pos = [3, 2] inner_tiles = [16, 32] into %dest : tensor<100x200x128x256xi32> -> tensor<100x200x4x16x16x32xi32> @@ -337,11 +337,11 @@ func.func @affine_constant_expr_pack(%arg0: tensor<100x128x200x256xi32>, %arg1: // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]] // CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<100x4x200x16x16x32xi32> -// CHECK: %[[PACKED_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACKED_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [3, 1] inner_tiles = [16, 32] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[ARG2_EMPTY:.+]] = tensor.empty() : tensor<1x4x1x1x32xi32> -// CHECK: %[[PACKED_ARG2:.+]] = tensor.pack %[[ARG2]] +// CHECK: %[[PACKED_ARG2:.+]] = linalg.pack %[[ARG2]] // CHECK-SAME: inner_dims_pos = [1] inner_tiles = [32] // CHECK-SAME: into %[[ARG2_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic @@ -367,7 +367,7 @@ func.func @transpose_pack_with_outer_dims(%arg0: tensor<100x128x200x256xi32>, %a %1 = arith.addi %0, %b2 : i32 linalg.yield %1 : i32 } -> tensor<100x200x128x256xi32> - %4 = tensor.pack %transpose + %4 = linalg.pack %transpose outer_dims_perm = [1, 2, 3, 0] inner_dims_pos = 
[3, 2] inner_tiles = [16, 32] @@ -384,11 +384,11 @@ func.func @transpose_pack_with_outer_dims(%arg0: tensor<100x128x200x256xi32>, %a // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]] // CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<200x4x16x100x16x32xi32> -// CHECK: %[[PACKED_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACKED_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [2, 1, 3, 0] inner_dims_pos = [3, 1] inner_tiles = [16, 32] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[ARG2_EMPTY:.+]] = tensor.empty() : tensor<4x32xi32> -// CHECK: %[[PACKED_ARG2:.+]] = tensor.pack %[[ARG2]] +// CHECK: %[[PACKED_ARG2:.+]] = linalg.pack %[[ARG2]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [32] // CHECK-SAME: into %[[ARG2_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic @@ -408,7 +408,7 @@ func.func @elem_pack_transpose_outer_dims(%arg0: tensor<128x256xi32>, %init: ten linalg.yield %4 : i32 } -> tensor<128x256xi32> %empty = tensor.empty() : tensor<16x4x32x16xi32> - %pack = tensor.pack %elem + %pack = linalg.pack %elem outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] @@ -421,11 +421,11 @@ func.func @elem_pack_transpose_outer_dims(%arg0: tensor<128x256xi32>, %init: ten // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] // CHECK: %[[ARG1_EMPTY:.+]] = tensor.empty() : tensor<16x4x32x16xi32> -// CHECK: %[[PACKED_ARG1:.+]] = tensor.pack %[[ARG1]] +// CHECK: %[[PACKED_ARG1:.+]] = linalg.pack %[[ARG1]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[ARG1_EMPTY]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<16x4x32x16xi32> -// CHECK: %[[PACKED_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACKED_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic @@ -439,7 +439,7 @@ func.func @elem_pack_transpose_outer_dims(%arg0: tensor<128x256xi32>, %init: ten func.func @unpack_on_output(%arg0: tensor<12x2x56x56x32xf32>) -> tensor<12x56x56x64xf32> { %0 = tensor.empty() : tensor<12x56x56x64xf32> - %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<12x2x56x56x32xf32> -> tensor<12x56x56x64xf32> + %1 = linalg.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<12x2x56x56x32xf32> -> tensor<12x56x56x64xf32> %2 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%1 : tensor<12x56x56x64xf32>) { ^bb0(%out: f32): %3 = arith.addf %out, %out : f32 @@ -452,17 +452,17 @@ func.func @unpack_on_output(%arg0: tensor<12x2x56x56x32xf32>) -> tensor<12x56x56 // CHECK-LABEL: func.func @unpack_on_output // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_EMPTY_UNPACK:.+]] = tensor.empty() : tensor<12x56x56x64xf32> -// CHECK: %[[UNPACKED_ARG0:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACKED_ARG0:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG0_EMPTY_UNPACK]] // CHECK: %[[ARG0_EMPTY_PACK:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32> -// CHECK: %[[PACKED_ARG0:.+]] = tensor.pack %[[UNPACKED_ARG0]] +// CHECK: %[[PACKED_ARG0:.+]] = linalg.pack %[[UNPACKED_ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // 
CHECK-SAME: into %[[ARG0_EMPTY_PACK]] // CHECK: %[[RES:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]]] // CHECK-SAME: outs(%[[PACKED_ARG0]] -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[RES]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[RES]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[UNPACKED_ARG0]] @@ -472,7 +472,7 @@ func.func @unpack_on_output(%arg0: tensor<12x2x56x56x32xf32>) -> tensor<12x56x56 func.func @unpack_on_input(%arg0: tensor<12x2x56x56x32xf32>, %init: tensor<12x56x56x64xf32>) -> tensor<12x56x56x64xf32> { %0 = tensor.empty() : tensor<12x56x56x64xf32> - %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<12x2x56x56x32xf32> -> tensor<12x56x56x64xf32> + %1 = linalg.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<12x2x56x56x32xf32> -> tensor<12x56x56x64xf32> %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1: tensor<12x56x56x64xf32>) outs(%init : tensor<12x56x56x64xf32>) { ^bb0(%in: f32, %out: f32): %3 = arith.addf %in, %out : f32 @@ -486,22 +486,22 @@ func.func @unpack_on_input(%arg0: tensor<12x2x56x56x32xf32>, %init: tensor<12x56 // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_UNPACK_EMPTY:.+]] = tensor.empty() : tensor<12x56x56x64xf32> -// CHECK: %[[UNPACKED_ARG0:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACKED_ARG0:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG0_UNPACK_EMPTY]] // CHECK: %[[ARG1_PACK_EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32> -// CHECK: %[[ARG1_PACK:.+]] = tensor.pack %[[ARG1]] +// CHECK: %[[ARG1_PACK:.+]] = linalg.pack %[[ARG1]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG1_PACK_EMPTY]] // CHECK: %[[ARG0_PACK_EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32> -// CHECK: %[[ARG0_PACK:.+]] = tensor.pack %[[UNPACKED_ARG0]] +// CHECK: %[[ARG0_PACK:.+]] = linalg.pack %[[UNPACKED_ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG0_PACK_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP]]] // CHECK-SAME: ins(%[[ARG0_PACK]] // CHECK-SAME: outs(%[[ARG1_PACK]] -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[RES]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[RES]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG1]] @@ -511,7 +511,7 @@ func.func @unpack_on_input(%arg0: tensor<12x2x56x56x32xf32>, %init: tensor<12x56 func.func @unpack_element_type_change(%arg0: tensor<12x2x56x56x32xf32>, %init: tensor<12x56x56x64xf16>) -> tensor<12x56x56x64xf16> { %0 = tensor.empty() : tensor<12x56x56x64xf32> - %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<12x2x56x56x32xf32> -> tensor<12x56x56x64xf32> + %1 = linalg.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<12x2x56x56x32xf32> -> tensor<12x56x56x64xf32> %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1: tensor<12x56x56x64xf32>) outs(%init : tensor<12x56x56x64xf16>) { ^bb0(%in: 
f32, %out: f16): %3 = arith.truncf %in : f32 to f16 @@ -525,22 +525,22 @@ func.func @unpack_element_type_change(%arg0: tensor<12x2x56x56x32xf32>, %init: t // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] // CHECK: %[[ARG0_UNPACK_EMPTY:.+]] = tensor.empty() : tensor<12x56x56x64xf32> -// CHECK: %[[UNPACKED_ARG0:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACKED_ARG0:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG0_UNPACK_EMPTY]] // CHECK: %[[ARG1_PACK_EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf16> -// CHECK: %[[ARG1_PACK:.+]] = tensor.pack %[[ARG1]] +// CHECK: %[[ARG1_PACK:.+]] = linalg.pack %[[ARG1]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG1_PACK_EMPTY]] // CHECK: %[[ARG0_PACK_EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32> -// CHECK: %[[ARG0_PACK:.+]] = tensor.pack %[[UNPACKED_ARG0]] +// CHECK: %[[ARG0_PACK:.+]] = linalg.pack %[[UNPACKED_ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG0_PACK_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP]]] // CHECK-SAME: ins(%[[ARG0_PACK]] // CHECK-SAME: outs(%[[ARG1_PACK]] -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[RES]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[RES]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG1]] @@ -551,7 +551,7 @@ func.func @unpack_element_type_change(%arg0: tensor<12x2x56x56x32xf32>, %init: t func.func @forward_tensor_empty(%arg0: tensor<12x2x56x56x32xf32>) -> tensor<12x56x56x64xf32> { %init = tensor.empty() : tensor<12x56x56x64xf32> %0 = tensor.empty() : tensor<12x56x56x64xf32> - %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<12x2x56x56x32xf32> -> tensor<12x56x56x64xf32> + %1 = linalg.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<12x2x56x56x32xf32> -> tensor<12x56x56x64xf32> %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1: tensor<12x56x56x64xf32>) outs(%init : tensor<12x56x56x64xf32>) { ^bb0(%in: f32, %out: f32): %3 = arith.addf %in, %in : f32 @@ -565,19 +565,19 @@ func.func @forward_tensor_empty(%arg0: tensor<12x2x56x56x32xf32>) -> tensor<12x5 // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[FINAL_RES:.+]] = tensor.empty() : tensor<12x56x56x64xf32> // CHECK: %[[ARG0_UNPACK_EMPTY:.+]] = tensor.empty() : tensor<12x56x56x64xf32> -// CHECK: %[[UNPACKED_ARG0:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACKED_ARG0:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG0_UNPACK_EMPTY]] // CHECK: %[[DEST:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32> // CHECK: %[[ARG0_PACK_EMPTY:.+]] = tensor.empty() : tensor<12x2x56x56x32xf32> -// CHECK: %[[PACKED_ARG0:.+]] = tensor.pack %[[UNPACKED_ARG0]] +// CHECK: %[[PACKED_ARG0:.+]] = linalg.pack %[[UNPACKED_ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[ARG0_PACK_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP]]] // CHECK-SAME: ins(%[[PACKED_ARG0]] // CHECK-SAME: outs(%[[DEST]] -// CHECK: 
%[[UNPACKED:.+]] = tensor.unpack %[[RES]] +// CHECK: %[[UNPACKED:.+]] = linalg.unpack %[[RES]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[FINAL_RES]] @@ -586,7 +586,7 @@ func.func @forward_tensor_empty(%arg0: tensor<12x2x56x56x32xf32>) -> tensor<12x5 func.func @pad_valid_unpack_propagation(%arg0: tensor<1x2x56x56x32xf32>) -> tensor<1x58x58x64xf32> { %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<1x56x56x64xf32> - %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<1x2x56x56x32xf32> -> tensor<1x56x56x64xf32> + %1 = linalg.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<1x2x56x56x32xf32> -> tensor<1x56x56x64xf32> %padded = tensor.pad %1 low[0, 1, 1, 0] high[0, 1, 1, 0] { ^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index): tensor.yield %cst : f32 @@ -599,7 +599,7 @@ func.func @pad_valid_unpack_propagation(%arg0: tensor<1x2x56x56x32xf32>) -> tens // CHECK: %[[CST:.+]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[PADDED:.+]] = tensor.pad %[[ARG0]] low[0, 0, 1, 1, 0] high[0, 0, 1, 1, 0] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x58x58x64xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[PADDED]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[PADDED]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[EMPTY]] : tensor<1x2x58x58x32xf32> -> tensor<1x58x58x64xf32> @@ -608,7 +608,7 @@ func.func @pad_valid_unpack_propagation(%arg0: tensor<1x2x56x56x32xf32>) -> tens func.func @pad_valid_unpack_propagation(%arg0: tensor<1x2x56x56x32xf32>) -> tensor<2x58x58x64xf32> { %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<1x56x56x64xf32> - %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<1x2x56x56x32xf32> -> tensor<1x56x56x64xf32> + %1 = linalg.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<1x2x56x56x32xf32> -> tensor<1x56x56x64xf32> %padded = tensor.pad %1 low[1, 1, 1, 0] high[0, 1, 1, 0] { ^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index): tensor.yield %cst : f32 @@ -621,7 +621,7 @@ func.func @pad_valid_unpack_propagation(%arg0: tensor<1x2x56x56x32xf32>) -> tens // CHECK: %[[CST:.+]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[PADDED:.+]] = tensor.pad %[[ARG0]] low[1, 0, 1, 1, 0] high[0, 0, 1, 1, 0] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<2x58x58x64xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[PADDED]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[PADDED]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[EMPTY]] : tensor<2x2x58x58x32xf32> -> tensor<2x58x58x64xf32> @@ -630,7 +630,7 @@ func.func @pad_valid_unpack_propagation(%arg0: tensor<1x2x56x56x32xf32>) -> tens func.func @pad_along_unpacked_dim(%arg0: tensor<1x2x56x56x32xf32>) -> tensor<1x58x58x66xf32> { %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<1x56x56x64xf32> - %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<1x2x56x56x32xf32> -> tensor<1x56x56x64xf32> + %1 = linalg.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<1x2x56x56x32xf32> -> tensor<1x56x56x64xf32> %padded = tensor.pad %1 low[0, 1, 1, 1] high[0, 1, 1, 1] 
{ ^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index): tensor.yield %cst : f32 @@ -642,7 +642,7 @@ func.func @pad_along_unpacked_dim(%arg0: tensor<1x2x56x56x32xf32>) -> tensor<1x5 // CHECK: %[[ARG0:.+]]: tensor<1x2x56x56x32xf32>) // CHECK: %[[CST:.+]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x56x56x64xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[EMPTY]] : tensor<1x2x56x56x32xf32> -> tensor<1x56x56x64xf32> // CHECK: %[[PADDED:.+]] = tensor.pad %[[UNPACK]] low[0, 1, 1, 1] high[0, 1, 1, 1] @@ -656,7 +656,7 @@ func.func @pad_valid_pack_propagation(%arg0: tensor<1x64x56x56xf32>) -> tensor<1 tensor.yield %cst : f32 } : tensor<1x64x56x56xf32> to tensor<1x64x58x58xf32> %0 = tensor.empty() : tensor<1x2x58x58x32xf32> - %1 = tensor.pack %padded inner_dims_pos = [1] inner_tiles = [32] into %0 : tensor<1x64x58x58xf32> -> tensor<1x2x58x58x32xf32> + %1 = linalg.pack %padded inner_dims_pos = [1] inner_tiles = [32] into %0 : tensor<1x64x58x58xf32> -> tensor<1x2x58x58x32xf32> return %1 : tensor<1x2x58x58x32xf32> } @@ -664,7 +664,7 @@ func.func @pad_valid_pack_propagation(%arg0: tensor<1x64x56x56xf32>) -> tensor<1 // CHECK-SAME: %[[ARG0:.+]]: tensor<1x64x56x56xf32>) // CHECK: %[[CST:.+]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x2x56x56x32xf32> -// CHECK: %[[PACKED:.+]] = tensor.pack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [32] +// CHECK: %[[PACKED:.+]] = linalg.pack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [32] // CHECK-SAME: into %[[EMPTY]] : tensor<1x64x56x56xf32> -> tensor<1x2x56x56x32xf32> // CHECK: %[[PADDED:.+]] = tensor.pad %[[PACKED]] low[0, 0, 1, 1, 0] high[0, 0, 1, 1, 0] // CHECK: return %[[PADDED]] @@ -678,7 +678,7 @@ func.func @pad_valid_outer_dims_pack_propagation(%arg0: tensor<1x64x56x56xf32>) tensor.yield %cst : f32 } : tensor<1x64x56x56xf32> to tensor<1x64x58x58xf32> %0 = tensor.empty() : tensor<1x58x58x2x32xf32> - %1 = tensor.pack %padded outer_dims_perm = [0, 3, 2, 1] inner_dims_pos = [1] inner_tiles = [32] into %0 : tensor<1x64x58x58xf32> -> tensor<1x58x58x2x32xf32> + %1 = linalg.pack %padded outer_dims_perm = [0, 3, 2, 1] inner_dims_pos = [1] inner_tiles = [32] into %0 : tensor<1x64x58x58xf32> -> tensor<1x58x58x2x32xf32> return %1 : tensor<1x58x58x2x32xf32> } @@ -686,7 +686,7 @@ func.func @pad_valid_outer_dims_pack_propagation(%arg0: tensor<1x64x56x56xf32>) // CHECK-SAME: %[[ARG0:.+]]: tensor<1x64x56x56xf32>) // CHECK: %[[CST:.+]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x56x56x2x32xf32> -// CHECK: %[[PACKED:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACKED:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 2, 1] inner_dims_pos = [1] inner_tiles = [32] // CHECK-SAME: into %[[EMPTY]] : tensor<1x64x56x56xf32> -> tensor<1x56x56x2x32xf32> // CHECK: %[[PADDED:.+]] = tensor.pad %[[PACKED]] low[0, 1, 1, 0, 0] high[0, 1, 1, 0, 0] @@ -701,7 +701,7 @@ func.func @pad_along_packed_dim(%arg0: tensor<1x60x56x56xf32>) -> tensor<1x2x58x tensor.yield %cst : f32 } : tensor<1x60x56x56xf32> to tensor<1x64x58x58xf32> %0 = tensor.empty() : tensor<1x2x58x58x32xf32> - %1 = tensor.pack %padded inner_dims_pos = [1] inner_tiles = [32] into %0 : tensor<1x64x58x58xf32> -> tensor<1x2x58x58x32xf32> + %1 = linalg.pack %padded inner_dims_pos = [1] inner_tiles = [32] into %0 : 
tensor<1x64x58x58xf32> -> tensor<1x2x58x58x32xf32> return %1 : tensor<1x2x58x58x32xf32> } @@ -710,7 +710,7 @@ func.func @pad_along_packed_dim(%arg0: tensor<1x60x56x56xf32>) -> tensor<1x2x58x // CHECK: %[[CST:.+]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[PADDED:.+]] = tensor.pad %[[ARG0]] low[0, 2, 1, 1] high[0, 2, 1, 1] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x2x58x58x32xf32> -// CHECK: tensor.pack %[[PADDED]] inner_dims_pos = [1] inner_tiles = [32] +// CHECK: linalg.pack %[[PADDED]] inner_dims_pos = [1] inner_tiles = [32] // CHECK-SAME: into %[[EMPTY]] : tensor<1x64x58x58xf32> -> tensor<1x2x58x58x32xf32> // ----- @@ -722,7 +722,7 @@ func.func @multi_use_pad_pack_propagation(%arg0: tensor<1x64x56x56xf32>) -> (ten tensor.yield %cst : f32 } : tensor<1x64x56x56xf32> to tensor<1x64x58x58xf32> %0 = tensor.empty() : tensor<1x2x58x58x32xf32> - %1 = tensor.pack %padded inner_dims_pos = [1] inner_tiles = [32] into %0 : tensor<1x64x58x58xf32> -> tensor<1x2x58x58x32xf32> + %1 = linalg.pack %padded inner_dims_pos = [1] inner_tiles = [32] into %0 : tensor<1x64x58x58xf32> -> tensor<1x2x58x58x32xf32> return %padded, %1 : tensor<1x64x58x58xf32>, tensor<1x2x58x58x32xf32> } @@ -730,10 +730,10 @@ func.func @multi_use_pad_pack_propagation(%arg0: tensor<1x64x56x56xf32>) -> (ten // CHECK-SAME: %[[ARG0:.+]]: tensor<1x64x56x56xf32>) // CHECK: %[[CST:.+]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x2x56x56x32xf32> -// CHECK: %[[PACKED:.+]] = tensor.pack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [32] +// CHECK: %[[PACKED:.+]] = linalg.pack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [32] // CHECK-SAME: into %[[EMPTY]] : tensor<1x64x56x56xf32> -> tensor<1x2x56x56x32xf32> // CHECK: %[[PADDED:.+]] = tensor.pad %[[PACKED]] low[0, 0, 1, 1, 0] high[0, 0, 1, 1, 0] -// CHECK: %[[UNPACKED:.+]] = tensor.unpack %[[PADDED]] inner_dims_pos = [1] inner_tiles = [32] +// CHECK: %[[UNPACKED:.+]] = linalg.unpack %[[PADDED]] inner_dims_pos = [1] inner_tiles = [32] // CHECK: return %[[UNPACKED]], %[[PADDED]] // ----- @@ -749,7 +749,7 @@ func.func @would_break_dominance(%arg0: tensor<128x256xi32>) -> tensor<4x16x16x3 linalg.yield %4 : i32 } -> tensor<128x256xi32> %dest = bufferization.alloc_tensor() : tensor<4x16x16x32xi32> - %pack = tensor.pack %elem + %pack = linalg.pack %elem inner_dims_pos = [1, 0] inner_tiles = [16, 32] into %dest : tensor<128x256xi32> -> tensor<4x16x16x32xi32> @@ -763,7 +763,7 @@ func.func @would_break_dominance(%arg0: tensor<128x256xi32>) -> tensor<4x16x16x3 // CHECK-SAME: ins(%[[ARG0]] // CHECK-SAME: outs(%[[EMPTY]] // CHECK: %[[ALLOC:.+]] = bufferization.alloc_tensor() : tensor<4x16x16x32xi32> -// CHECK-NEXT: %{{.+}} = tensor.pack %[[GEN]] +// CHECK-NEXT: %{{.+}} = linalg.pack %[[GEN]] // CHECK-SAME: inner_dims_pos = [1, 0] inner_tiles = [16, 32] // CHECK-SAME: into %[[ALLOC]] @@ -779,7 +779,7 @@ func.func @scalar_tensor(%arg0 : tensor) -> tensor<1x32x7x7x32xf32> { linalg.yield %in : f32 } -> tensor<1x7x7x1024xf32> %empty_pack = tensor.empty() : tensor<1x32x7x7x32xf32> - %pack = tensor.pack %gen outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %empty_pack : tensor<1x7x7x1024xf32> -> tensor<1x32x7x7x32xf32> + %pack = linalg.pack %gen outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [32] into %empty_pack : tensor<1x7x7x1024xf32> -> tensor<1x32x7x7x32xf32> return %pack : tensor<1x32x7x7x32xf32> } @@ -800,7 +800,7 @@ func.func @scalar_tensor(%arg0 : tensor) -> tensor<1x32x7x7x32xf32> { func.func 
@unpack_empty_inner_dims(%arg0: tensor<12x64x56x56xf32>) -> tensor<12x56x56x64xf32> { %init = tensor.empty() : tensor<12x56x56x64xf32> %0 = tensor.empty() : tensor<12x56x56x64xf32> - %1 = tensor.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [] inner_tiles = [] into %0 : tensor<12x64x56x56xf32> -> tensor<12x56x56x64xf32> + %1 = linalg.unpack %arg0 outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [] inner_tiles = [] into %0 : tensor<12x64x56x56xf32> -> tensor<12x56x56x64xf32> %2 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1: tensor<12x56x56x64xf32>) outs(%init : tensor<12x56x56x64xf32>) { ^bb0(%in: f32, %out: f32): %3 = arith.addf %in, %in : f32 @@ -810,13 +810,13 @@ func.func @unpack_empty_inner_dims(%arg0: tensor<12x64x56x56xf32>) -> tensor<12x } // CHECK-LABEL: func.func @unpack_empty_inner_dims -// CHECK: %[[UNPACKED_ARG0:.+]] = tensor.unpack +// CHECK: %[[UNPACKED_ARG0:.+]] = linalg.unpack // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [] inner_tiles = [] -// CHECK: %[[PACKED_ARG0:.+]] = tensor.pack %[[UNPACKED_ARG0]] +// CHECK: %[[PACKED_ARG0:.+]] = linalg.pack %[[UNPACKED_ARG0]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [] inner_tiles = [] // CHECK: %[[RES:.+]] = linalg.generic // CHECK-SAME: ins(%[[PACKED_ARG0]] -// CHECK: %[[UNPACKED:.+]] = tensor.unpack %[[RES]] +// CHECK: %[[UNPACKED:.+]] = linalg.unpack %[[RES]] // CHECK-SAME: outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [] inner_tiles = [] // ----- @@ -833,7 +833,7 @@ func.func @reduction_pack_transpose_inner_dims(%arg0: tensor<128x256x32xi32>, linalg.yield %4 : i32 } -> tensor<128x256xi32> %dest = tensor.empty() : tensor<4x16x16x32xi32> - %pack = tensor.pack %elem + %pack = linalg.pack %elem inner_dims_pos = [1, 0] inner_tiles = [16, 32] into %dest : tensor<128x256xi32> -> tensor<4x16x16x32xi32> @@ -845,11 +845,11 @@ func.func @reduction_pack_transpose_inner_dims(%arg0: tensor<128x256x32xi32>, // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] // CHECK: %[[ARG1_EMPTY:.+]] = tensor.empty() : tensor<4x16x16x32xi32> -// CHECK: %[[PACK_ARG1:.+]] = tensor.pack %[[ARG1]] +// CHECK: %[[PACK_ARG1:.+]] = linalg.pack %[[ARG1]] // CHECK-SAME: inner_dims_pos = [1, 0] inner_tiles = [16, 32] // CHECK-SAME: into %[[ARG1_EMPTY]] // CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<4x16x32x16x32xi32> -// CHECK: %[[PACK_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [1, 0] inner_tiles = [16, 32] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[RED:.+]] = linalg.generic @@ -879,7 +879,7 @@ func.func @reduction_pack_with_outer_dims(%arg0: tensor<100x128x200x256xi32>, %a linalg.yield %2 : i32 } -> tensor<100x128x256xi32> %init_pack = tensor.empty() : tensor<4x16x100x16x32xi32> - %4 = tensor.pack %reduction + %4 = linalg.pack %reduction outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 32] @@ -897,15 +897,15 @@ func.func @reduction_pack_with_outer_dims(%arg0: tensor<100x128x200x256xi32>, %a // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]] // CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]] // CHECK: %[[ARG3_EMPTY:.+]] = tensor.empty() : tensor<4x16x100x16x32xi32> -// CHECK: %[[PACKED_ARG3:.+]] = tensor.pack %[[ARG3]] +// CHECK: %[[PACKED_ARG3:.+]] = linalg.pack %[[ARG3]] // CHECK-SAME: outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 32] // CHECK-SAME: into %[[ARG3_EMPTY]] // CHECK: %[[ARG0_EMPTY:.+]] = 
tensor.empty() : tensor<4x16x200x100x16x32xi32> -// CHECK: %[[PACKED_ARG0:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACKED_ARG0:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [1, 3, 2, 0] inner_dims_pos = [3, 1] inner_tiles = [16, 32] // CHECK-SAME: into %[[ARG0_EMPTY]] // CHECK: %[[ARG2_EMPTY:.+]] = tensor.empty() : tensor<4x32xi32> -// CHECK: %[[PACKED_ARG2:.+]] = tensor.pack %[[ARG2]] +// CHECK: %[[PACKED_ARG2:.+]] = linalg.pack %[[ARG2]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [32] // CHECK-SAME: into %[[ARG2_EMPTY]] // CHECK: %[[RES:.+]] = linalg.generic @@ -922,7 +922,7 @@ func.func @unpack_different_destination_shape(%arg0: tensor<1x1x1080x1920x16xi32 %filter: tensor<2x2xi32>) -> tensor<16x540x960xi32>{ %init = tensor.empty() : tensor<16x540x960xi32> %empty = tensor.empty() : tensor<1x16x1080x1920xi32> - %unpack = tensor.unpack %arg0 + %unpack = linalg.unpack %arg0 inner_dims_pos = [1] inner_tiles = [16] into %empty : tensor<1x1x1080x1920x16xi32> -> tensor<1x16x1080x1920xi32> @@ -944,7 +944,7 @@ func.func @unpack_different_destination_shape(%arg0: tensor<1x1x1080x1920x16xi32 // CHECK: %[[FINAL_RES:.+]] = tensor.empty() : tensor<16x540x960xi32> // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x540x960x16xi32> // CHECK: %[[PACK_EMPTY:.+]] = tensor.empty() : tensor<1x1x1080x1920x16xi32> -// CHECK: %[[PACK_ARG0:.+]] = tensor.pack +// CHECK: %[[PACK_ARG0:.+]] = linalg.pack // CHECK-SAME: inner_dims_pos = [1] inner_tiles = [16] // CHECK-SAME: into %[[PACK_EMPTY]] // CHECK: %[[POOL:.+]] = linalg.generic @@ -952,7 +952,7 @@ func.func @unpack_different_destination_shape(%arg0: tensor<1x1x1080x1920x16xi32 // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "parallel"] // CHECK-SAME: ins(%[[PACK_ARG0]], %[[ARG1]] // CHECK-SAME: outs(%[[INIT]] -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[POOL]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[POOL]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [16] // CHECK-SAME: into %[[FINAL_RES]] // CHECK: return %[[UNPACK]] : tensor<16x540x960xi32> @@ -962,7 +962,7 @@ func.func @unpack_different_destination_shape(%arg0: tensor<1x1x1080x1920x16xi32 func.func @bubble_up_pack_through_collapse(%1: tensor, %dim : index) -> tensor { %collapsed = tensor.collapse_shape %1 [[0, 1], [2]] : tensor into tensor %2 = tensor.empty(%dim) : tensor - %pack = tensor.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %2 : tensor -> tensor + %pack = linalg.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %2 : tensor -> tensor func.return %pack : tensor } // CHECK-LABEL: func.func @bubble_up_pack_through_collapse @@ -971,7 +971,7 @@ func.func @bubble_up_pack_through_collapse(%1: tensor, %dim : index) // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [8, 1] into %[[EMPTY]] : tensor -> tensor +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [8, 1] into %[[EMPTY]] : tensor -> tensor // CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[PACK]] {{\[}}[0, 1], [2], [3], [4]] : tensor into tensor // CHECK: return %[[COLLAPSED]] : tensor @@ -980,7 +980,7 @@ func.func @bubble_up_pack_through_collapse(%1: tensor, %dim : index) 
func.func @bubble_up_pack_through_collapse_empty_outer_dims_perm(%1: tensor, %dim : index) -> tensor { %collapsed = tensor.collapse_shape %1 [[0, 1], [2]] : tensor into tensor %2 = tensor.empty(%dim) : tensor - %pack = tensor.pack %collapsed inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %2 : tensor -> tensor + %pack = linalg.pack %collapsed inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %2 : tensor -> tensor func.return %pack : tensor } // CHECK-LABEL: func.func @bubble_up_pack_through_collapse_empty_outer_dims_perm @@ -989,7 +989,7 @@ func.func @bubble_up_pack_through_collapse_empty_outer_dims_perm(%1: tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] inner_dims_pos = [1, 2] inner_tiles = [8, 1] into %[[EMPTY]] : tensor -> tensor +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] inner_dims_pos = [1, 2] inner_tiles = [8, 1] into %[[EMPTY]] : tensor -> tensor // CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[PACK]] {{\[}}[0, 1], [2], [3], [4]] : tensor into tensor // CHECK: return %[[COLLAPSED]] : tensor @@ -998,13 +998,13 @@ func.func @bubble_up_pack_through_collapse_empty_outer_dims_perm(%1: tensor) -> tensor<4x32x3072x8x1xf32> { %collapsed = tensor.collapse_shape %1 [[0], [1, 2], [3]] : tensor<4x192x16x256xf32> into tensor<4x3072x256xf32> %2 = tensor.empty() : tensor<4x32x3072x8x1xf32> - %pack = tensor.pack %collapsed outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [8, 1] into %2 : tensor<4x3072x256xf32> -> tensor<4x32x3072x8x1xf32> + %pack = linalg.pack %collapsed outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [8, 1] into %2 : tensor<4x3072x256xf32> -> tensor<4x32x3072x8x1xf32> func.return %pack : tensor<4x32x3072x8x1xf32> } // CHECK-LABEL: func.func @bubble_up_permuted_pack_through_collapse // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<4x32x192x16x8x1xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3, 2] inner_tiles = [8, 1] into %[[EMPTY]] : tensor<4x192x16x256xf32> -> tensor<4x32x192x16x8x1xf32> +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3, 2] inner_tiles = [8, 1] into %[[EMPTY]] : tensor<4x192x16x256xf32> -> tensor<4x32x192x16x8x1xf32> // CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %pack {{\[}}[0], [1], [2, 3], [4], [5]] : tensor<4x32x192x16x8x1xf32> into tensor<4x32x3072x8x1xf32> // CHECK: return %[[COLLAPSED]] : tensor<4x32x3072x8x1xf32> @@ -1013,13 +1013,13 @@ func.func @bubble_up_permuted_pack_through_collapse(%1: tensor<4x192x16x256xf32> func.func @bubble_up_pack_through_unit_collapse(%1: tensor<1x64x1x4xf32>) -> tensor<8x4x8x1xf32> { %collapsed = tensor.collapse_shape %1 [[0, 1, 2], [3]] : tensor<1x64x1x4xf32> into tensor<64x4xf32> %2 = tensor.empty() : tensor<8x4x8x1xf32> - %pack = tensor.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %2 : tensor<64x4xf32> -> tensor<8x4x8x1xf32> + %pack = linalg.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %2 : tensor<64x4xf32> -> tensor<8x4x8x1xf32> func.return %pack : tensor<8x4x8x1xf32> } // CHECK-LABEL: func.func @bubble_up_pack_through_unit_collapse // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x8x1x4x8x1xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [1, 3] inner_tiles = [8, 1] into %[[EMPTY]] 
: tensor<1x64x1x4xf32> -> tensor<1x8x1x4x8x1xf32> +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [1, 3] inner_tiles = [8, 1] into %[[EMPTY]] : tensor<1x64x1x4xf32> -> tensor<1x8x1x4x8x1xf32> // CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[PACK]] {{\[}}[0, 1, 2], [3], [4], [5]] : tensor<1x8x1x4x8x1xf32> into tensor<8x4x8x1xf32> // CHECK: return %[[COLLAPSED]] : tensor<8x4x8x1xf32> @@ -1028,7 +1028,7 @@ func.func @bubble_up_pack_through_unit_collapse(%1: tensor<1x64x1x4xf32>) -> ten func.func @bubble_up_pack_through_collapse_on_outer_dims(%1: tensor, %dim : index) -> tensor { %collapsed = tensor.collapse_shape %1 [[0, 1], [2]] : tensor into tensor %2 = tensor.empty(%dim) : tensor - %pack = tensor.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [4] into %2 : tensor -> tensor + %pack = linalg.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [4] into %2 : tensor -> tensor func.return %pack : tensor } // CHECK-LABEL: func.func @bubble_up_pack_through_collapse_on_outer_dims @@ -1037,7 +1037,7 @@ func.func @bubble_up_pack_through_collapse_on_outer_dims(%1: tensor, // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [2] inner_tiles = [4] into %[[EMPTY]] : tensor -> tensor +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [2] inner_tiles = [4] into %[[EMPTY]] : tensor -> tensor // CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[PACK]] {{\[}}[0, 1], [2], [3]] : tensor into tensor // CHECK: return %[[COLLAPSED]] : tensor @@ -1046,13 +1046,13 @@ func.func @bubble_up_pack_through_collapse_on_outer_dims(%1: tensor, func.func @no_bubble_up_pack_through_non_divisible_collapse(%1: tensor<3072x64x4xf32>) -> tensor<384x32x8x8xf32> { %collapsed = tensor.collapse_shape %1 [[0], [1, 2]] : tensor<3072x64x4xf32> into tensor<3072x256xf32> %2 = tensor.empty() : tensor<384x32x8x8xf32> - %pack = tensor.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %2 : tensor<3072x256xf32> -> tensor<384x32x8x8xf32> + %pack = linalg.pack %collapsed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %2 : tensor<3072x256xf32> -> tensor<384x32x8x8xf32> func.return %pack : tensor<384x32x8x8xf32> } // CHECK-LABEL: func.func @no_bubble_up_pack_through_non_divisible_collapse // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0], [1, 2]] : tensor<3072x64x4xf32> into tensor<3072x256xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[COLLAPSED]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[COLLAPSED]] // CHECK: return %[[PACK]] : tensor<384x32x8x8xf32> // ----- @@ -1060,13 +1060,13 @@ func.func @no_bubble_up_pack_through_non_divisible_collapse(%1: tensor<3072x64x4 func.func @bubble_up_pack_outer_expanded_through_expand(%arg0: tensor<32x64xf32>) -> tensor<4x2x64x4xf32> { %empty = tensor.empty() : tensor<4x2x64x4xf32> %expanded = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [4, 8, 64] : tensor<32x64xf32> into tensor<4x8x64xf32> - %pack = tensor.pack %expanded inner_dims_pos = [1] inner_tiles = [4] into %empty : tensor<4x8x64xf32> -> tensor<4x2x64x4xf32> + %pack = linalg.pack %expanded inner_dims_pos = [1] inner_tiles = [4] into %empty : tensor<4x8x64xf32> -> 
tensor<4x2x64x4xf32> return %pack : tensor<4x2x64x4xf32> } // CHECK-LABEL: func.func @bubble_up_pack_outer_expanded_through_expand( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x64x4xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [4] into %[[EMPTY]] : tensor<32x64xf32> -> tensor<8x64x4xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[PACK]] {{\[}}[0, 1], [2], [3]] // CHECK-SAME: output_shape [4, 2, 64, 4] : tensor<8x64x4xf32> into tensor<4x2x64x4xf32> @@ -1077,13 +1077,13 @@ func.func @bubble_up_pack_outer_expanded_through_expand(%arg0: tensor<32x64xf32> func.func @bubble_up_pack_inner_expanded_through_expand(%arg0: tensor<32x64xf32>) -> tensor<32x4x4x4xf32> { %empty = tensor.empty() : tensor<32x4x4x4xf32> %expanded = tensor.expand_shape %arg0 [[0], [1, 2]] output_shape [32, 4, 16] : tensor<32x64xf32> into tensor<32x4x16xf32> - %pack = tensor.pack %expanded inner_dims_pos = [2] inner_tiles = [4] into %empty : tensor<32x4x16xf32> -> tensor<32x4x4x4xf32> + %pack = linalg.pack %expanded inner_dims_pos = [2] inner_tiles = [4] into %empty : tensor<32x4x16xf32> -> tensor<32x4x4x4xf32> return %pack : tensor<32x4x4x4xf32> } // CHECK-LABEL: func.func @bubble_up_pack_inner_expanded_through_expand( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32x16x4xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [1] inner_tiles = [4] into %[[EMPTY]] // CHECK-SAME: : tensor<32x64xf32> -> tensor<32x16x4xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[PACK]] {{\[}}[0], [1, 2], [3]] @@ -1095,13 +1095,13 @@ func.func @bubble_up_pack_inner_expanded_through_expand(%arg0: tensor<32x64xf32> func.func @bubble_up_pack_non_expanded_dims_through_expand(%arg0: tensor<32x64x16xf32>) -> tensor<8x2x32x16x4xf32> { %empty = tensor.empty() : tensor<8x2x32x16x4xf32> %expanded = tensor.expand_shape %arg0 [[0], [1, 2], [3]] output_shape [32, 2, 32, 16] : tensor<32x64x16xf32> into tensor<32x2x32x16xf32> - %pack = tensor.pack %expanded inner_dims_pos = [0] inner_tiles = [4] into %empty : tensor<32x2x32x16xf32> -> tensor<8x2x32x16x4xf32> + %pack = linalg.pack %expanded inner_dims_pos = [0] inner_tiles = [4] into %empty : tensor<32x2x32x16xf32> -> tensor<8x2x32x16x4xf32> return %pack : tensor<8x2x32x16x4xf32> } // CHECK-LABEL: func.func @bubble_up_pack_non_expanded_dims_through_expand( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x64x16x4xf32> -// CHECK: %[[PACK:.+]] = tensor.pack +// CHECK: %[[PACK:.+]] = linalg.pack // CHECK-SAME: %[[ARG0]] inner_dims_pos = [0] inner_tiles = [4] into %[[EMPTY]] // CHECK-SAME: : tensor<32x64x16xf32> -> tensor<8x64x16x4xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[PACK]] {{\[}}[0], [1, 2], [3], [4]] @@ -1115,7 +1115,7 @@ func.func @bubble_up_pack_through_expand_dynamic(%arg0: tensor) -> ten %dim = tensor.dim %arg0, %c0 : tensor %empty = tensor.empty(%dim) : tensor %expanded = tensor.expand_shape %arg0 [[0], [1, 2]] output_shape [%dim, 4, 16] : tensor into tensor - %pack = tensor.pack %expanded inner_dims_pos = [2] inner_tiles = [8] into %empty : tensor -> tensor + %pack = linalg.pack %expanded inner_dims_pos = [2] inner_tiles = [8] into %empty : tensor -> tensor return %pack : tensor } // CHECK-LABEL: func.func @bubble_up_pack_through_expand_dynamic( @@ -1123,7 
+1123,7 @@ func.func @bubble_up_pack_through_expand_dynamic(%arg0: tensor) -> ten // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM_INPUT:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM_INPUT]]) : tensor -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [1] inner_tiles = [8] into %[[EMPTY]] // CHECK-SAME: : tensor -> tensor // CHECK: %[[DIM_PACK:.+]] = tensor.dim %[[PACK]], %[[C0]] : tensor @@ -1137,14 +1137,14 @@ func.func @bubble_up_pack_non_expanded_padding_through_expand(%arg0: tensor<32x6 %cst = arith.constant 3.000000e+00 : f32 %empty = tensor.empty() : tensor<4x2x8x4x8xf32> %expanded = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [4, 8, 64] : tensor<32x60xf32> into tensor<4x8x60xf32> - %pack = tensor.pack %expanded padding_value(%cst : f32) inner_dims_pos = [1, 2] inner_tiles = [4, 8] into %empty : tensor<4x8x60xf32> -> tensor<4x2x8x4x8xf32> + %pack = linalg.pack %expanded padding_value(%cst : f32) inner_dims_pos = [1, 2] inner_tiles = [4, 8] into %empty : tensor<4x8x60xf32> -> tensor<4x2x8x4x8xf32> return %pack : tensor<4x2x8x4x8xf32> } // CHECK-LABEL: func.func @bubble_up_pack_non_expanded_padding_through_expand( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK-DAG: %[[CST:.+]] = arith.constant 3.000000e+00 : f32 // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x8x4x8xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] padding_value(%[[CST]] : f32) +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] padding_value(%[[CST]] : f32) // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [4, 8] into %[[EMPTY]] // CHECK-SAME: : tensor<32x60xf32> -> tensor<8x8x4x8xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[PACK]] {{\[}}[0, 1], [2], [3], [4]] @@ -1156,13 +1156,13 @@ func.func @bubble_up_pack_non_expanded_padding_through_expand(%arg0: tensor<32x6 func.func @bubble_up_pack_outer_dims_perm_identity_through_expand(%arg0: tensor<32x64xf32>) -> tensor<4x2x32x4x2xf32> { %empty = tensor.empty() : tensor<4x2x32x4x2xf32> %expanded = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [4, 8, 64] : tensor<32x64xf32> into tensor<4x8x64xf32> - %pack = tensor.pack %expanded outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 2] into %empty : tensor<4x8x64xf32> -> tensor<4x2x32x4x2xf32> + %pack = linalg.pack %expanded outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [4, 2] into %empty : tensor<4x8x64xf32> -> tensor<4x2x32x4x2xf32> return %pack : tensor<4x2x32x4x2xf32> } // CHECK-LABEL: func.func @bubble_up_pack_outer_dims_perm_identity_through_expand( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x32x4x2xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [4, 2] into %[[EMPTY]] // CHECK-SAME: : tensor<32x64xf32> -> tensor<8x32x4x2xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[PACK]] {{\[}}[0, 1], [2], [3], [4]] @@ -1174,13 +1174,13 @@ func.func @bubble_up_pack_outer_dims_perm_identity_through_expand(%arg0: tensor< func.func @bubble_up_pack_multiple_dims_through_expand(%arg0: tensor<32x64x16xf32>) -> tensor<8x2x4x8x4x8x2xf32> { %empty = tensor.empty() : tensor<8x2x4x8x4x8x2xf32> %expanded = tensor.expand_shape %arg0 [[0], [1, 2], [3]] output_shape [32, 2, 32, 16] : tensor<32x64x16xf32> into tensor<32x2x32x16xf32> - %pack = tensor.pack %expanded inner_dims_pos = 
[0, 2, 3] inner_tiles = [4, 8, 2] into %empty : tensor<32x2x32x16xf32> -> tensor<8x2x4x8x4x8x2xf32> + %pack = linalg.pack %expanded inner_dims_pos = [0, 2, 3] inner_tiles = [4, 8, 2] into %empty : tensor<32x2x32x16xf32> -> tensor<8x2x4x8x4x8x2xf32> return %pack : tensor<8x2x4x8x4x8x2xf32> } // CHECK-LABEL: func.func @bubble_up_pack_multiple_dims_through_expand( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x8x8x4x8x2xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [0, 1, 2] inner_tiles = [4, 8, 2] into %[[EMPTY]] // CHECK-SAME: : tensor<32x64x16xf32> -> tensor<8x8x8x4x8x2xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[PACK]] {{\[}}[0], [1, 2], [3], [4], [5], [6]] @@ -1192,13 +1192,13 @@ func.func @bubble_up_pack_multiple_dims_through_expand(%arg0: tensor<32x64x16xf3 func.func @bubble_up_pack_inner_dims_reorder_through_expand(%arg0: tensor<32x64xf32>) -> tensor<4x2x4x16x4xf32> { %empty = tensor.empty() : tensor<4x2x4x16x4xf32> %expanded = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [4, 8, 64] : tensor<32x64xf32> into tensor<4x8x64xf32> - %pack = tensor.pack %expanded inner_dims_pos = [2, 1] inner_tiles = [16, 4] into %empty : tensor<4x8x64xf32> -> tensor<4x2x4x16x4xf32> + %pack = linalg.pack %expanded inner_dims_pos = [2, 1] inner_tiles = [16, 4] into %empty : tensor<4x8x64xf32> -> tensor<4x2x4x16x4xf32> return %pack : tensor<4x2x4x16x4xf32> } // CHECK-LABEL: func.func @bubble_up_pack_inner_dims_reorder_through_expand( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x4x16x4xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [1, 0] inner_tiles = [16, 4] into %[[EMPTY]] // CHECK-SAME: : tensor<32x64xf32> -> tensor<8x4x16x4xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[PACK]] {{\[}}[0, 1], [2], [3], [4]] @@ -1210,13 +1210,13 @@ func.func @bubble_up_pack_inner_dims_reorder_through_expand(%arg0: tensor<32x64x func.func @bubble_up_pack_multiple_different_expanded_dims_through_expand(%arg0: tensor<32x64x16xf32>) -> tensor<4x2x2x8x16x4x4xf32> { %empty = tensor.empty() : tensor<4x2x2x8x16x4x4xf32> %expanded = tensor.expand_shape %arg0 [[0, 1], [2, 3], [4]] output_shape [4, 8, 2, 32, 16] : tensor<32x64x16xf32> into tensor<4x8x2x32x16xf32> - %pack = tensor.pack %expanded inner_dims_pos = [1, 3] inner_tiles = [4, 4] into %empty : tensor<4x8x2x32x16xf32> -> tensor<4x2x2x8x16x4x4xf32> + %pack = linalg.pack %expanded inner_dims_pos = [1, 3] inner_tiles = [4, 4] into %empty : tensor<4x8x2x32x16xf32> -> tensor<4x2x2x8x16x4x4xf32> return %pack : tensor<4x2x2x8x16x4x4xf32> } // CHECK-LABEL: func.func @bubble_up_pack_multiple_different_expanded_dims_through_expand( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x16x16x4x4xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [4, 4] into %[[EMPTY]] // CHECK-SAME: : tensor<32x64x16xf32> -> tensor<8x16x16x4x4xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[PACK]] {{\[}}[0, 1], [2, 3], [4], [5], [6]] @@ -1228,7 +1228,7 @@ func.func @bubble_up_pack_multiple_different_expanded_dims_through_expand(%arg0: func.func @no_bubble_up_pack_outer_dims_permutation_through_expand(%arg0: tensor<32x64xf32>) -> tensor<32x4x2x4x2xf32> { %empty = 
tensor.empty() : tensor<32x4x2x4x2xf32> %expanded = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [4, 8, 64] : tensor<32x64xf32> into tensor<4x8x64xf32> - %pack = tensor.pack %expanded outer_dims_perm = [2, 0, 1] inner_dims_pos = [1, 2] inner_tiles = [4, 2] into %empty : tensor<4x8x64xf32> -> tensor<32x4x2x4x2xf32> + %pack = linalg.pack %expanded outer_dims_perm = [2, 0, 1] inner_dims_pos = [1, 2] inner_tiles = [4, 2] into %empty : tensor<4x8x64xf32> -> tensor<32x4x2x4x2xf32> return %pack : tensor<32x4x2x4x2xf32> } // CHECK-LABEL: func.func @no_bubble_up_pack_outer_dims_permutation_through_expand( @@ -1236,7 +1236,7 @@ func.func @no_bubble_up_pack_outer_dims_permutation_through_expand(%arg0: tensor // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32x4x2x4x2xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2]] // CHECK-SAME: output_shape [4, 8, 64] : tensor<32x64xf32> into tensor<4x8x64xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[EXPANDED]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[EXPANDED]] // CHECK-SAME: outer_dims_perm = [2, 0, 1] inner_dims_pos = [1, 2] inner_tiles = [4, 2] into %[[EMPTY]] // CHECK-SAME: : tensor<4x8x64xf32> -> tensor<32x4x2x4x2xf32> // CHECK: return %[[PACK]] : tensor<32x4x2x4x2xf32> @@ -1246,7 +1246,7 @@ func.func @no_bubble_up_pack_outer_dims_permutation_through_expand(%arg0: tensor func.func @no_bubble_up_pack_multiple_same_expanded_dim_through_expand(%arg0: tensor<32x64xf32>) -> tensor<2x2x64x2x4xf32> { %empty = tensor.empty() : tensor<2x2x64x2x4xf32> %expanded = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [4, 8, 64] : tensor<32x64xf32> into tensor<4x8x64xf32> - %pack = tensor.pack %expanded inner_dims_pos = [0, 1] inner_tiles = [2, 4] into %empty : tensor<4x8x64xf32> -> tensor<2x2x64x2x4xf32> + %pack = linalg.pack %expanded inner_dims_pos = [0, 1] inner_tiles = [2, 4] into %empty : tensor<4x8x64xf32> -> tensor<2x2x64x2x4xf32> return %pack : tensor<2x2x64x2x4xf32> } // CHECK-LABEL: func.func @no_bubble_up_pack_multiple_same_expanded_dim_through_expand( @@ -1254,7 +1254,7 @@ func.func @no_bubble_up_pack_multiple_same_expanded_dim_through_expand(%arg0: te // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<2x2x64x2x4xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2]] // CHECK-SAME: output_shape [4, 8, 64] : tensor<32x64xf32> into tensor<4x8x64xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[EXPANDED]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[EXPANDED]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [2, 4] into %[[EMPTY]] // CHECK-SAME: : tensor<4x8x64xf32> -> tensor<2x2x64x2x4xf32> // CHECK: return %[[PACK]] : tensor<2x2x64x2x4xf32> @@ -1264,7 +1264,7 @@ func.func @no_bubble_up_pack_multiple_same_expanded_dim_through_expand(%arg0: te func.func @no_bubble_up_pack_non_innermost_expanded_dim_through_expand(%arg0: tensor<32x64xf32>) -> tensor<2x8x64x2xf32> { %empty = tensor.empty() : tensor<2x8x64x2xf32> %expanded = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [4, 8, 64] : tensor<32x64xf32> into tensor<4x8x64xf32> - %pack = tensor.pack %expanded inner_dims_pos = [0] inner_tiles = [2] into %empty : tensor<4x8x64xf32> -> tensor<2x8x64x2xf32> + %pack = linalg.pack %expanded inner_dims_pos = [0] inner_tiles = [2] into %empty : tensor<4x8x64xf32> -> tensor<2x8x64x2xf32> return %pack : tensor<2x8x64x2xf32> } // CHECK-LABEL: func.func @no_bubble_up_pack_non_innermost_expanded_dim_through_expand( @@ -1272,7 +1272,7 @@ func.func 
@no_bubble_up_pack_non_innermost_expanded_dim_through_expand(%arg0: te // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<2x8x64x2xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2]] // CHECK-SAME: output_shape [4, 8, 64] : tensor<32x64xf32> into tensor<4x8x64xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[EXPANDED]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[EXPANDED]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [2] into %[[EMPTY]] // CHECK-SAME: : tensor<4x8x64xf32> -> tensor<2x8x64x2xf32> // CHECK: return %[[PACK]] : tensor<2x8x64x2xf32> @@ -1283,7 +1283,7 @@ func.func @no_bubble_up_pack_expanded_padding_through_expand_cannot_reassociate( %cst = arith.constant 3.000000e+00 : f32 %empty = tensor.empty() : tensor<3x2x60x8xf32> %expanded = tensor.expand_shape %arg0 [[0, 1], [2]] output_shape [3, 10, 60] : tensor<30x60xf32> into tensor<3x10x60xf32> - %pack = tensor.pack %expanded padding_value(%cst : f32) inner_dims_pos = [1] inner_tiles = [8] into %empty : tensor<3x10x60xf32> -> tensor<3x2x60x8xf32> + %pack = linalg.pack %expanded padding_value(%cst : f32) inner_dims_pos = [1] inner_tiles = [8] into %empty : tensor<3x10x60xf32> -> tensor<3x2x60x8xf32> return %pack : tensor<3x2x60x8xf32> } // CHECK-LABEL: func.func @no_bubble_up_pack_expanded_padding_through_expand_cannot_reassociate( @@ -1292,7 +1292,7 @@ func.func @no_bubble_up_pack_expanded_padding_through_expand_cannot_reassociate( // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<3x2x60x8xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2]] // CHECK-SAME: output_shape [3, 10, 60] : tensor<30x60xf32> into tensor<3x10x60xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[EXPANDED]] padding_value(%[[CST]] : f32) +// CHECK: %[[PACK:.+]] = linalg.pack %[[EXPANDED]] padding_value(%[[CST]] : f32) // CHECK-SAME: inner_dims_pos = [1] inner_tiles = [8] into %[[EMPTY]] // CHECK-SAME: : tensor<3x10x60xf32> -> tensor<3x2x60x8xf32> // CHECK: return %[[PACK]] : tensor<3x2x60x8xf32> @@ -1302,7 +1302,7 @@ func.func @no_bubble_up_pack_expanded_padding_through_expand_cannot_reassociate( func.func @no_bubble_up_pack_extending_dimension_through_expand_cannot_reassociate(%arg0: tensor<32x64xf32>) -> tensor<8x4x16x8xf32> { %empty = tensor.empty() : tensor<8x4x16x8xf32> %expanded = tensor.expand_shape %arg0 [[0], [1, 2]] output_shape [32, 4, 16] : tensor<32x64xf32> into tensor<32x4x16xf32> - %pack = tensor.pack %expanded inner_dims_pos = [0] inner_tiles = [8] into %empty : tensor<32x4x16xf32> -> tensor<8x4x16x8xf32> + %pack = linalg.pack %expanded inner_dims_pos = [0] inner_tiles = [8] into %empty : tensor<32x4x16xf32> -> tensor<8x4x16x8xf32> return %pack : tensor<8x4x16x8xf32> } // CHECK-LABEL: func.func @no_bubble_up_pack_extending_dimension_through_expand_cannot_reassociate( @@ -1310,7 +1310,7 @@ func.func @no_bubble_up_pack_extending_dimension_through_expand_cannot_reassocia // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x4x16x8xf32> // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1, 2]] // CHECK-SAME: output_shape [32, 4, 16] : tensor<32x64xf32> into tensor<32x4x16xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[EXPANDED]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[EXPANDED]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [8] into %[[EMPTY]] // CHECK-SAME: : tensor<32x4x16xf32> -> tensor<8x4x16x8xf32> // CHECK: return %[[PACK]] : tensor<8x4x16x8xf32> @@ -1319,7 +1319,7 @@ func.func @no_bubble_up_pack_extending_dimension_through_expand_cannot_reassocia func.func 
@push_down_unpack_through_expand(%5: tensor, %dim: index, %sz0: index) -> tensor { %6 = tensor.empty(%dim) : tensor - %unpack = tensor.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor -> tensor + %unpack = linalg.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor -> tensor %expanded = tensor.expand_shape %unpack [[0, 1], [2]] output_shape [%sz0, 256, 256] : tensor into tensor func.return %expanded : tensor } @@ -1333,14 +1333,14 @@ func.func @push_down_unpack_through_expand(%5: tensor, %dim: index // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2], [3], [4]] output_shape [%[[SZ0]], 32, 32, 8, 8] : tensor into tensor // CHECK: %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] : tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[EXPANDED:.+]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [8, 8] into %[[EMPTY]] : tensor -> tensor +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[EXPANDED:.+]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [8, 8] into %[[EMPTY]] : tensor -> tensor // CHECK: return %[[UNPACK]] : tensor // ----- func.func @push_down_unpack_through_expand_empty_outer_dims_perm(%5: tensor, %dim: index, %sz0: index) -> tensor { %6 = tensor.empty(%dim) : tensor - %unpack = tensor.unpack %5 inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor -> tensor + %unpack = linalg.unpack %5 inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor -> tensor %expanded = tensor.expand_shape %unpack [[0, 1], [2]] output_shape [%sz0, 256, 256] : tensor into tensor func.return %expanded : tensor } @@ -1354,14 +1354,14 @@ func.func @push_down_unpack_through_expand_empty_outer_dims_perm(%5: tensor into tensor // CHECK: %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] : tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[EXPANDED:.+]] inner_dims_pos = [1, 2] inner_tiles = [8, 8] into %[[EMPTY]] : tensor -> tensor +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[EXPANDED:.+]] inner_dims_pos = [1, 2] inner_tiles = [8, 8] into %[[EMPTY]] : tensor -> tensor // CHECK: return %[[UNPACK]] : tensor // ----- func.func @push_down_permuted_unpack_through_expand(%5: tensor<4x32x384x8x8xf32>) -> tensor<4x12x256x256xf32> { %6 = tensor.empty() : tensor<4x3072x256xf32> - %unpack = tensor.unpack %5 outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [8, 8] into %6 : tensor<4x32x384x8x8xf32> -> tensor<4x3072x256xf32> + %unpack = linalg.unpack %5 outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [8, 8] into %6 : tensor<4x32x384x8x8xf32> -> tensor<4x3072x256xf32> %expanded = tensor.expand_shape %unpack [[0], [1, 2], [3]] output_shape [4, 12, 256, 256] : tensor<4x3072x256xf32> into tensor<4x12x256x256xf32> func.return %expanded : tensor<4x12x256x256xf32> } @@ -1369,14 +1369,14 @@ func.func @push_down_permuted_unpack_through_expand(%5: tensor<4x32x384x8x8xf32> // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1], [2, 3], [4], [5]] output_shape [4, 32, 12, 32, 8, 8] : tensor<4x32x384x8x8xf32> into tensor<4x32x12x32x8x8xf32> // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<4x12x256x256xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[EXPANDED]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3, 2] inner_tiles = [8, 8] into %[[EMPTY]] : tensor<4x32x12x32x8x8xf32> 
-> tensor<4x12x256x256xf32> +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[EXPANDED]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3, 2] inner_tiles = [8, 8] into %[[EMPTY]] : tensor<4x32x12x32x8x8xf32> -> tensor<4x12x256x256xf32> // CHECK: return %[[UNPACK]] : tensor<4x12x256x256xf32> // ----- func.func @push_down_unpack_through_unit_expand(%5: tensor<6x32x8x8xf32>) -> tensor<3x16x1x256xf32> { %6 = tensor.empty() : tensor<48x256xf32> - %unpack = tensor.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor<6x32x8x8xf32> -> tensor<48x256xf32> + %unpack = linalg.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor<6x32x8x8xf32> -> tensor<48x256xf32> %expanded = tensor.expand_shape %unpack [[0, 1, 2], [3]] output_shape [3, 16, 1, 256] : tensor<48x256xf32> into tensor<3x16x1x256xf32> func.return %expanded : tensor<3x16x1x256xf32> } @@ -1384,14 +1384,14 @@ func.func @push_down_unpack_through_unit_expand(%5: tensor<6x32x8x8xf32>) -> ten // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1, 2], [3], [4], [5]] output_shape [3, 2, 1, 32, 8, 8] : tensor<6x32x8x8xf32> into tensor<3x2x1x32x8x8xf32> // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<3x16x1x256xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[EXPANDED]] outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [1, 3] inner_tiles = [8, 8] into %[[EMPTY]] : tensor<3x2x1x32x8x8xf32> -> tensor<3x16x1x256xf32> +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[EXPANDED]] outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [1, 3] inner_tiles = [8, 8] into %[[EMPTY]] : tensor<3x2x1x32x8x8xf32> -> tensor<3x16x1x256xf32> // CHECK: return %[[UNPACK]] : tensor<3x16x1x256xf32> // ----- func.func @push_down_unpack_through_expand_on_outer_dims(%5: tensor, %dim: index, %sz0: index) -> tensor { %6 = tensor.empty(%dim) : tensor - %unpack = tensor.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [8] into %6 : tensor -> tensor + %unpack = linalg.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [8] into %6 : tensor -> tensor %expanded = tensor.expand_shape %unpack [[0, 1], [2]] output_shape [%sz0, 256, 256] : tensor into tensor func.return %expanded : tensor } @@ -1405,19 +1405,19 @@ func.func @push_down_unpack_through_expand_on_outer_dims(%5: tensor, // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2], [3]] output_shape [%[[SZ0]], 256, 32, 8] : tensor into tensor // CHECK: %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] : tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[EXPANDED:.+]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [2] inner_tiles = [8] into %[[EMPTY]] : tensor -> tensor +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[EXPANDED:.+]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [2] inner_tiles = [8] into %[[EMPTY]] : tensor -> tensor // CHECK: return %[[UNPACK]] : tensor // ----- func.func @no_push_down_unpack_through_non_divisible_expand(%5: tensor<384x32x8x8xf32>) -> tensor<256x12x256xf32> { %6 = tensor.empty() : tensor<3072x256xf32> - %unpack = tensor.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor<384x32x8x8xf32> -> tensor<3072x256xf32> + %unpack = linalg.unpack %5 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %6 : tensor<384x32x8x8xf32> -> tensor<3072x256xf32> %expanded = tensor.expand_shape %unpack [[0, 1], [2]] 
output_shape [256, 12, 256] : tensor<3072x256xf32> into tensor<256x12x256xf32> func.return %expanded : tensor<256x12x256xf32> } // CHECK-LABEL: func.func @no_push_down_unpack_through_non_divisible_expand // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[UNPACK]] {{\[}}[0, 1], [2]] output_shape [256, 12, 256] : tensor<3072x256xf32> into tensor<256x12x256xf32> // CHECK: return %[[EXPANDED]] : tensor<256x12x256xf32> diff --git a/mlir/test/Dialect/Linalg/decompose-tensor-pack-tile.mlir b/mlir/test/Dialect/Linalg/decompose-tensor-pack-tile.mlir index ec761d9a49436..72fde5490a305 100644 --- a/mlir/test/Dialect/Linalg/decompose-tensor-pack-tile.mlir +++ b/mlir/test/Dialect/Linalg/decompose-tensor-pack-tile.mlir @@ -4,7 +4,7 @@ // RUN: -transform-interpreter %s | FileCheck %s func.func @KCRS_to_KCRSsr(%arg0: tensor<1x1x128x64xf32>, %arg1: tensor<1x1x4x8x8x32xf32>) -> tensor<1x1x4x8x8x32xf32> { - %0 = tensor.pack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x128x64xf32> -> tensor<1x1x4x8x8x32xf32> + %0 = linalg.pack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x128x64xf32> -> tensor<1x1x4x8x8x32xf32> return %0 : tensor<1x1x4x8x8x32xf32> } // CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 * 32)> @@ -27,7 +27,7 @@ func.func @KCRS_to_KCRSsr(%arg0: tensor<1x1x128x64xf32>, %arg1: tensor<1x1x4x8x8 module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1, %loops:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -36,7 +36,7 @@ module attributes {transform.with_named_sequence} { // ----- func.func @pad_and_pack(%arg0: tensor<13x15xf32>, %arg1: tensor<2x8x8x2xf32>, %arg2: f32) -> tensor<2x8x8x2xf32> { - %0 = tensor.pack %arg0 padding_value(%arg2 : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %arg1 : tensor<13x15xf32> -> tensor<2x8x8x2xf32> + %0 = linalg.pack %arg0 padding_value(%arg2 : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %arg1 : tensor<13x15xf32> -> tensor<2x8x8x2xf32> return %0 : tensor<2x8x8x2xf32> } // CHECK: func.func @pad_and_pack @@ -54,7 +54,7 @@ func.func @pad_and_pack(%arg0: tensor<13x15xf32>, %arg1: tensor<2x8x8x2xf32>, %a module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -64,7 +64,7 @@ module attributes {transform.with_named_sequence} { func.func @KC_to_CKkc(%arg0: tensor<128x256xf32>, %arg1: tensor<32x4x32x8xf32>) -> tensor<32x4x32x8xf32> { - %0 = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] 
into %arg1 : tensor<128x256xf32> -> tensor<32x4x32x8xf32> + %0 = linalg.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<128x256xf32> -> tensor<32x4x32x8xf32> return %0 : tensor<32x4x32x8xf32> } // CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 * 32)> @@ -85,7 +85,7 @@ func.func @KC_to_CKkc(%arg0: tensor<128x256xf32>, %arg1: tensor<32x4x32x8xf32>) // CHECK-SAME: [%[[C]], %[[K]], 0, 0] [1, 1, 32, 8] [1, 1, 1, 1] : tensor<1x1x32x8xf32> into tensor<32x4x32x8xf32> module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } diff --git a/mlir/test/Dialect/Linalg/decompose-tensor-pack.mlir b/mlir/test/Dialect/Linalg/decompose-tensor-pack.mlir index 1cc1484ed4095..911b453f919c3 100644 --- a/mlir/test/Dialect/Linalg/decompose-tensor-pack.mlir +++ b/mlir/test/Dialect/Linalg/decompose-tensor-pack.mlir @@ -5,7 +5,7 @@ func.func @simple_KCRS_to_KCRSsr(%arg0: tensor, %arg1: tensor<1x1x?x1xi32>) -> tensor<1x1x?x1xi32> { %c8 = arith.constant 8 : index %c5 = arith.constant 5 : i32 - %pack = tensor.pack %arg0 padding_value(%c5 : i32) inner_dims_pos = [0, 1] inner_tiles = [%c8, 1] into %arg1 : tensor -> tensor<1x1x?x1xi32> + %pack = linalg.pack %arg0 padding_value(%c5 : i32) inner_dims_pos = [0, 1] inner_tiles = [%c8, 1] into %arg1 : tensor -> tensor<1x1x?x1xi32> return %pack : tensor<1x1x?x1xi32> } @@ -32,7 +32,7 @@ func.func @simple_KCRS_to_KCRSsr(%arg0: tensor, %arg1: tensor<1x1x?x1xi // ----- func.func @simple_pad_and_pack_static_tiles(%input: tensor<5x1xf32>, %output: tensor<1x1x8x2xf32>, %pad: f32) -> tensor<1x1x8x2xf32> { - %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor<5x1xf32> -> tensor<1x1x8x2xf32> + %0 = linalg.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor<5x1xf32> -> tensor<1x1x8x2xf32> return %0 : tensor<1x1x8x2xf32> } // CHECK: #[[$ATTR_0:.+]] = affine_map<()[s0] -> (s0 - 5)> @@ -52,7 +52,7 @@ func.func @simple_pad_and_pack_static_tiles(%input: tensor<5x1xf32>, %output: te /// Same as example above, but with 1 dynamic tile size. 
func.func @simple_pad_and_pack_dynamic_tile(%input: tensor<5x1xf32>, %output: tensor<1x1x?x2xf32>, %pad: f32, %tile_dim_0: index) -> tensor<1x1x?x2xf32> { - %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_dim_0, 2] into %output : tensor<5x1xf32> -> tensor<1x1x?x2xf32> + %0 = linalg.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_dim_0, 2] into %output : tensor<5x1xf32> -> tensor<1x1x?x2xf32> return %0 : tensor<1x1x?x2xf32> } // CHECK-LABEL: func.func @simple_pad_and_pack_dynamic_tile( @@ -72,7 +72,7 @@ func.func @simple_pad_and_pack_dynamic_tile(%input: tensor<5x1xf32>, %output: te func.func @simple_pad_and_pack_dynamic_tile_cst(%input: tensor<5x1xf32>, %output: tensor<1x1x?x2xf32>, %pad: f32) -> tensor<1x1x?x2xf32> { %tile_dim_0 = arith.constant 8 : index - %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_dim_0, 2] into %output : tensor<5x1xf32> -> tensor<1x1x?x2xf32> + %0 = linalg.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_dim_0, 2] into %output : tensor<5x1xf32> -> tensor<1x1x?x2xf32> return %0 : tensor<1x1x?x2xf32> } // CHECK-LABEL: func.func @simple_pad_and_pack_dynamic_tile_cst( @@ -86,7 +86,7 @@ func.func @simple_pad_and_pack_dynamic_tile_cst(%input: tensor<5x1xf32>, %output // CHECK: return %[[RES]] : tensor<1x1x?x2xf32> func.func @simple_pad_and_pack_dynamic_tile_transpose(%input: tensor<5x1xf32>, %output: tensor<1x1x2x?xf32>, %pad: f32, %tile_dim_1: index) -> tensor<1x1x2x?xf32> { - %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [1, 0] inner_tiles = [2, %tile_dim_1] into %output : tensor<5x1xf32> -> tensor<1x1x2x?xf32> + %0 = linalg.pack %input padding_value(%pad : f32) inner_dims_pos = [1, 0] inner_tiles = [2, %tile_dim_1] into %output : tensor<5x1xf32> -> tensor<1x1x2x?xf32> return %0 : tensor<1x1x2x?xf32> } // CHECK-LABEL: func.func @simple_pad_and_pack_dynamic_tile_transpose( @@ -116,7 +116,7 @@ func.func @simple_pad_and_pack_scalable_tile(%input: tensor<5x1xf32>, %output: t %c8 = arith.constant 8 : index %vscale = vector.vscale %c8_vscale = arith.muli %vscale, %c8 : index - %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%c8_vscale, 2] into %output : tensor<5x1xf32> -> tensor<1x1x?x2xf32> + %0 = linalg.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%c8_vscale, 2] into %output : tensor<5x1xf32> -> tensor<1x1x?x2xf32> return %0 : tensor<1x1x?x2xf32> } @@ -138,7 +138,7 @@ func.func @simple_pad_and_pack_scalable_tile(%input: tensor<5x1xf32>, %output: t /// Same as example above, but with both tile sizes dynamic. 
func.func @simple_pad_and_pack_dynamic_tiles(%input: tensor<5x1xf32>, %output: tensor<1x1x?x?xf32>, %pad: f32, %tile_dim_0: index, %tile_dim_1: index) -> tensor<1x1x?x?xf32> { - %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_dim_0, %tile_dim_1] into %output : tensor<5x1xf32> -> tensor<1x1x?x?xf32> + %0 = linalg.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_dim_0, %tile_dim_1] into %output : tensor<5x1xf32> -> tensor<1x1x?x?xf32> return %0 : tensor<1x1x?x?xf32> } // CHECK-LABEL: func.func @simple_pad_and_pack_dynamic_tiles( @@ -158,7 +158,7 @@ func.func @simple_pad_and_pack_dynamic_tiles(%input: tensor<5x1xf32>, %output: t // ----- func.func @simple_pad_and_pack_dynamic_tile_not_all_dims_tiled(%input: tensor<1x1x5x1xf32>, %output: tensor<1x1x1x1x2x?xf32>, %pad: f32, %high: index) -> tensor<1x1x1x1x2x?xf32> { - %0 = tensor.pack %input padding_value(%pad : f32) outer_dims_perm = [1, 0, 2, 3] inner_dims_pos = [3, 2] inner_tiles = [2, %high] into %output : tensor<1x1x5x1xf32> -> tensor<1x1x1x1x2x?xf32> + %0 = linalg.pack %input padding_value(%pad : f32) outer_dims_perm = [1, 0, 2, 3] inner_dims_pos = [3, 2] inner_tiles = [2, %high] into %output : tensor<1x1x5x1xf32> -> tensor<1x1x1x1x2x?xf32> return %0 : tensor<1x1x1x1x2x?xf32> } // CHECK: #[[$ATTR_2:.+]] = affine_map<()[s0] -> (s0 - 5)> @@ -183,7 +183,7 @@ func.func @simple_pad_and_pack_dynamic_tile_not_all_dims_tiled(%input: tensor<1x // ----- func.func @simple_NC_to_CNnc(%arg0: tensor<32x8xf32>, %arg1: tensor<1x1x32x8xf32>) -> tensor<1x1x32x8xf32>{ - %0 = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<32x8xf32> -> tensor<1x1x32x8xf32> + %0 = linalg.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<32x8xf32> -> tensor<1x1x32x8xf32> return %0 : tensor<1x1x32x8xf32> } // CHECK-LABEL: func.func @simple_NC_to_CNnc @@ -197,7 +197,7 @@ func.func @simple_NC_to_CNnc(%arg0: tensor<32x8xf32>, %arg1: tensor<1x1x32x8xf32 // ----- func.func @simple_CHW_to_CHWhwc(%arg0: tensor<3x5x7xf32>, %arg1: tensor<1x1x1x5x7x3xf32>) -> tensor<1x1x1x5x7x3xf32> { - %0 = tensor.pack %arg0 inner_dims_pos = [1, 2, 0] inner_tiles = [5, 7, 3] into %arg1 : tensor<3x5x7xf32> -> tensor<1x1x1x5x7x3xf32> + %0 = linalg.pack %arg0 inner_dims_pos = [1, 2, 0] inner_tiles = [5, 7, 3] into %arg1 : tensor<3x5x7xf32> -> tensor<1x1x1x5x7x3xf32> return %0 : tensor<1x1x1x5x7x3xf32> } // CHECK-LABEL: func.func @simple_CHW_to_CHWhwc @@ -215,7 +215,7 @@ func.func @simple_CHW_to_CHWhwc(%arg0: tensor<3x5x7xf32>, %arg1: tensor<1x1x1x5x // ----- func.func @simple_KCRS_to_KRSCsr(%arg0: tensor<1x1x32x8xf32>, %arg1: tensor<1x1x1x1x8x32xf32>) -> tensor<1x1x1x1x8x32xf32> { - %0 = tensor.pack %arg0 outer_dims_perm = [0, 2, 3, 1] inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x32x8xf32> -> tensor<1x1x1x1x8x32xf32> + %0 = linalg.pack %arg0 outer_dims_perm = [0, 2, 3, 1] inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x32x8xf32> -> tensor<1x1x1x1x8x32xf32> return %0 : tensor<1x1x1x1x8x32xf32> } // CHECK-LABEL: func.func @simple_KCRS_to_KRSCsr diff --git a/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir b/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir index 0dbdf470bbfc9..03437223f0d45 100644 --- a/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir +++ b/mlir/test/Dialect/Linalg/decompose-tensor-unpack-tile.mlir @@ -4,13 +4,13 @@ // RUN: 
-transform-interpreter %s | FileCheck %s
func.func @KCRSsr_to_KCRS(%arg0: tensor<1x1x4x8x8x32xf32>, %arg1: tensor<1x1x128x64xf32>) -> tensor<1x1x128x64xf32> {
- %0 = tensor.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x4x8x8x32xf32> -> tensor<1x1x128x64xf32>
+ %0 = linalg.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x4x8x8x32xf32> -> tensor<1x1x128x64xf32>
return %0 : tensor<1x1x128x64xf32>
}
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
%1, %loops:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 32, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
transform.yield
}
@@ -38,7 +38,7 @@ module attributes {transform.with_named_sequence} {
// -----
func.func @unpack_and_extract_slice(%arg0: tensor<2x8x8x2xf32>, %arg1: tensor<13x15xf32>) -> tensor<13x15xf32> {
- %0 = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %arg1 : tensor<2x8x8x2xf32> -> tensor<13x15xf32>
+ %0 = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %arg1 : tensor<2x8x8x2xf32> -> tensor<13x15xf32>
return %0 : tensor<13x15xf32>
}
// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (-d0 + 13, 8)>
@@ -70,7 +70,7 @@ func.func @unpack_and_extract_slice(%arg0: tensor<2x8x8x2xf32>, %arg1: tensor<13
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
%1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [8, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
transform.yield
}
@@ -79,7 +79,7 @@ module attributes {transform.with_named_sequence} {
// -----
func.func @CKkc_to_KC(%arg0: tensor<32x4x32x8xf32>, %arg1: tensor<128x256xf32>) -> tensor<128x256xf32> {
- %0 = tensor.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<32x4x32x8xf32> -> tensor<128x256xf32>
+ %0 = linalg.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<32x4x32x8xf32> -> tensor<128x256xf32>
return %0 : tensor<128x256xf32>
}
// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 floordiv 32)>
@@ -102,7 +102,7 @@ func.func @CKkc_to_KC(%arg0: tensor<32x4x32x8xf32>, %arg1: tensor<128x256xf32>)
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
- %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
%1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [32, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
transform.yield
}
diff --git a/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir b/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir
index ba1f214952562..d460c506d6e18 100644
--- a/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir
+++ b/mlir/test/Dialect/Linalg/decompose-tensor-unpack.mlir
@@ -3,7 +3,7 @@
// RUN: -transform-interpreter=entry-point=decompose_unpack %s | FileCheck %s
func.func @simple_KCRSsr_to_KCRS(%arg0: tensor<1x1x1x1x8x32xf32>, %arg1: tensor<1x1x32x8xf32>) -> tensor<1x1x32x8xf32> {
- %0 = tensor.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x1x1x8x32xf32> -> tensor<1x1x32x8xf32>
+ %0 = linalg.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor<1x1x1x1x8x32xf32> -> tensor<1x1x32x8xf32>
return %0 : tensor<1x1x32x8xf32>
}
// CHECK-LABEL: func.func @simple_KCRSsr_to_KCRS
@@ -22,7 +22,7 @@ func.func @simple_KCRSsr_to_KCRS(%arg0: tensor<
// -----
func.func @simple_unpack_static_tiles(%input: tensor<1x1x8x2xf32>, %output: tensor<5x1xf32>) -> tensor<5x1xf32> {
- %0 = tensor.unpack %input inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor<1x1x8x2xf32> -> tensor<5x1xf32>
+ %0 = linalg.unpack %input inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor<1x1x8x2xf32> -> tensor<5x1xf32>
return %0 : tensor<5x1xf32>
}
// CHECK-LABEL: func.func @simple_unpack_static_tiles
@@ -38,7 +38,7 @@ func.func @simple_unpack_static_tiles(%input: tensor<1x1x8x2xf32>, %output: tens
/// Same as example above, but with 1 dynamic tile size.
func.func @simple_unpack_dynamic_tile(%input: tensor<1x1x?x2xf32>, %output: tensor<5x1xf32>, %tile_dim: index) -> tensor<5x1xf32> {
- %0 = tensor.unpack %input inner_dims_pos = [0, 1] inner_tiles = [%tile_dim, 2] into %output : tensor<1x1x?x2xf32> -> tensor<5x1xf32>
+ %0 = linalg.unpack %input inner_dims_pos = [0, 1] inner_tiles = [%tile_dim, 2] into %output : tensor<1x1x?x2xf32> -> tensor<5x1xf32>
return %0 : tensor<5x1xf32>
}
// CHECK-LABEL: func.func @simple_unpack_dynamic_tile
@@ -55,7 +55,7 @@ func.func @simple_unpack_dynamic_tile(%input: tensor<1x1x?x2xf32>, %output: tens
/// Same as example above, but with 1 dynamic tile size and a trasnpose
func.func @simple_unpack_dynamic_tile_transpose(%src: tensor<1x1x2x?xf32>, %dest: tensor<5x1xf32>, %tile_dim: index) -> tensor<5x1xf32> {
- %0 = tensor.unpack %src inner_dims_pos = [1, 0] inner_tiles = [2, %tile_dim] into %dest : tensor<1x1x2x?xf32> -> tensor<5x1xf32>
+ %0 = linalg.unpack %src inner_dims_pos = [1, 0] inner_tiles = [2, %tile_dim] into %dest : tensor<1x1x2x?xf32> -> tensor<5x1xf32>
return %0 : tensor<5x1xf32>
}
// CHECK-LABEL: func.func @simple_unpack_dynamic_tile_transpose
@@ -78,7 +78,7 @@ func.func @simple_unpack_scalable_tile(%input: tensor<1x1x?x2xf32>, %output: ten
%c8 = arith.constant 8 : index
%vscale = vector.vscale
%c8_vscale = arith.muli %vscale, %c8 : index
- %0 = tensor.unpack %input inner_dims_pos = [0, 1] inner_tiles = [%c8_vscale, 2] into %output : tensor<1x1x?x2xf32> -> tensor<5x1xf32>
+ %0 = linalg.unpack %input inner_dims_pos = [0, 1] inner_tiles = [%c8_vscale, 2] into %output : tensor<1x1x?x2xf32> -> tensor<5x1xf32>
return %0 : tensor<5x1xf32>
}
// CHECK-LABEL: func.func @simple_unpack_scalable_tile
@@ -97,7 +97,7 @@ func.func @simple_unpack_scalable_tile(%input: tensor<1x1x?x2xf32>, %output: ten
// -----
func.func @simple_CNnc_to_NC(%arg0: tensor<1x1x32x8xf32>, %arg1: tensor<32x8xf32>) -> tensor<32x8xf32>{
- %0 = tensor.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<1x1x32x8xf32>
-> tensor<32x8xf32> + %0 = linalg.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<1x1x32x8xf32> -> tensor<32x8xf32> return %0 : tensor<32x8xf32> } // CHECK-LABEL: func.func @simple_CNnc_to_NC @@ -112,7 +112,7 @@ func.func @simple_CNnc_to_NC(%arg0: tensor<1x1x32x8xf32>, %arg1: tensor<32x8xf32 // ----- func.func @simple_NCHWc_to_NCHW(%arg0: tensor<2x1x16x8x32xf32>, %arg1: tensor<2x32x16x8xf32>) -> tensor<2x32x16x8xf32> { - %0 = tensor.unpack %arg0 inner_dims_pos = [1] inner_tiles = [32] into %arg1 : tensor<2x1x16x8x32xf32> -> tensor<2x32x16x8xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [1] inner_tiles = [32] into %arg1 : tensor<2x1x16x8x32xf32> -> tensor<2x32x16x8xf32> return %0 : tensor<2x32x16x8xf32> } // CHECK-LABEL: func.func @simple_NCHWc_to_NCHW @@ -131,7 +131,7 @@ func.func @simple_NCHWc_to_NCHW(%arg0: tensor<2x1x16x8x32xf32>, %arg1: tensor<2x // ----- func.func @simple_NHWC_to_NCHW(%arg0: tensor<1x16x8x32xf32>, %arg1: tensor<1x32x16x8xf32>) -> tensor<1x32x16x8xf32> { - %0 = tensor.unpack %arg0 outer_dims_perm = [0, 2, 3, 1] inner_dims_pos = [] inner_tiles = [] into %arg1 : tensor<1x16x8x32xf32> -> tensor<1x32x16x8xf32> + %0 = linalg.unpack %arg0 outer_dims_perm = [0, 2, 3, 1] inner_dims_pos = [] inner_tiles = [] into %arg1 : tensor<1x16x8x32xf32> -> tensor<1x32x16x8xf32> return %0 : tensor<1x32x16x8xf32> } // CHECK-LABEL: func.func @simple_NHWC_to_NCHW @@ -150,7 +150,7 @@ func.func @simple_NHWC_to_NCHW(%arg0: tensor<1x16x8x32xf32>, %arg1: tensor<1x32x // ----- func.func @unpack_with_dynamic_dims(%arg0: tensor, %arg1: tensor) -> tensor { - %0 = tensor.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor -> tensor + %0 = linalg.unpack %arg0 inner_dims_pos = [3, 2] inner_tiles = [8, 32] into %arg1 : tensor -> tensor return %0 : tensor } // CHECK-LABEL: func.func @unpack_with_dynamic_dims diff --git a/mlir/test/Dialect/Linalg/fold-empty-op.mlir b/mlir/test/Dialect/Linalg/fold-empty-op.mlir new file mode 100644 index 0000000000000..5ce19d7091318 --- /dev/null +++ b/mlir/test/Dialect/Linalg/fold-empty-op.mlir @@ -0,0 +1,82 @@ +// RUN: mlir-opt -split-input-file -transform-interpreter %s | FileCheck %s + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) { + %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func"> + transform.apply_patterns to %func_op { + transform.apply_patterns.linalg.fold_pack_unpack_into_empty + } : !transform.op<"func.func"> + transform.yield + } +} + +func.func @pack_empty(%arg0: tensor<8x8x32x32xf32>) -> tensor<8x8x32x32xf32> { + %empty_unpacked = tensor.empty() : tensor<256x256xf32> + %packed = linalg.pack %empty_unpacked + inner_dims_pos = [0, 1] inner_tiles = [32, 32] + into %arg0 : tensor<256x256xf32> -> tensor<8x8x32x32xf32> + return %packed : tensor<8x8x32x32xf32> +} + +// CHECK-LABEL: func.func @pack_empty( +// CHECK-SAME: %[[T:.+]]: tensor<8x8x32x32xf32> +// CHECK-NOT: linalg.pack +// CHECK: return %[[T]] : tensor<8x8x32x32xf32> + +func.func @pack_empty_dynamic(%arg0: tensor, %dim0: index, %dim1: index) -> tensor { + %empty_unpacked = tensor.empty(%dim0, %dim1) : tensor + %packed = linalg.pack %empty_unpacked + inner_dims_pos = [0, 1] inner_tiles = [32, 32] + into %arg0 : tensor -> tensor + return %packed : tensor +} + +// CHECK-LABEL: func.func @pack_empty_dynamic( +// CHECK-SAME: %[[T:.+]]: tensor, +// CHECK-SAME: 
%[[DIM0:[a-zA-Z0-9_]+]]: index, +// CHECK-SAME: %[[DIM1:[a-zA-Z0-9_]+]]: index +// CHECK-NOT: linalg.pack +// CHECK: return %[[T]] : tensor + +func.func @unpack_empty(%arg0: tensor<256x256xf32>) -> tensor<256x256xf32> { + %empty_packed = tensor.empty() : tensor<8x8x32x32xf32> + %unpacked = linalg.unpack %empty_packed + inner_dims_pos = [0, 1] inner_tiles = [32, 32] + into %arg0 : tensor<8x8x32x32xf32> -> tensor<256x256xf32> + return %unpacked : tensor<256x256xf32> +} + +// CHECK-LABEL: func.func @unpack_empty( +// CHECK-SAME: %[[T:.+]]: tensor<256x256xf32> +// CHECK-NOT: linalg.unpack +// CHECK: return %[[T]] : tensor<256x256xf32> + +func.func @unpack_empty_dynamic(%arg0: tensor, %dim0: index, %dim1: index) -> tensor { + %empty_packed = tensor.empty(%dim0, %dim1) : tensor + %unpacked = linalg.unpack %empty_packed + inner_dims_pos = [0, 1] inner_tiles = [32, 32] + into %arg0 : tensor -> tensor + return %unpacked : tensor +} + +// CHECK-LABEL: func.func @unpack_empty_dynamic( +// CHECK-SAME: %[[T:.+]]: tensor, +// CHECK-SAME: %[[DIM0:[a-zA-Z0-9_]+]]: index, +// CHECK-SAME: %[[DIM1:[a-zA-Z0-9_]+]]: index +// CHECK-NOT: linalg.unpack +// CHECK: return %[[T]] : tensor + +func.func @pack_padded_empty(%arg0: tensor<8x8x32x32xf32>) -> tensor<8x8x32x32xf32> { + %pad = arith.constant 1.0 : f32 + %empty_unpacked = tensor.empty() : tensor<256x256xf32> + %packed = linalg.pack %empty_unpacked + padding_value(%pad : f32) + inner_dims_pos = [0, 1] inner_tiles = [32, 32] + into %arg0 : tensor<256x256xf32> -> tensor<8x8x32x32xf32> + return %packed : tensor<8x8x32x32xf32> +} + +// CHECK-LABEL: func.func @pack_padded_empty( +// CHECK-SAME: %[[T:.+]]: tensor<8x8x32x32xf32> +// CHECK: %[[PACK:.+]] = linalg.pack +// CHECK: return %[[PACK]] : tensor<8x8x32x32xf32> diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir index cff741e75077e..f2283db8f89b2 100644 --- a/mlir/test/Dialect/Linalg/invalid.mlir +++ b/mlir/test/Dialect/Linalg/invalid.mlir @@ -1284,6 +1284,7 @@ func.func @indexing_map_size_one_batch_matmul(%arg0: memref, ins(%arg0, %arg1 : memref, memref) outs(%arg2: memref) return + } // ----- @@ -1459,3 +1460,187 @@ func.func @invalid_C_map_result_dim_batch_matmul(%arg0: memref, %arg1 outs(%arg2: memref) return } + + +// ----- + +//===----------------------------------------------------------------------===// +// linalg.pack +//===----------------------------------------------------------------------===// + +func.func @pack_invalid_no_padding_no_full_tiles(%input: tensor<256x128xf32>, %output: tensor<8x8x16x33xf32>) -> tensor<8x8x16x33xf32> { + // expected-error@+1 {{invalid tile factor or output size provided. Only full tiles are supported when padding_value is not set}} + %0 = linalg.pack %input inner_dims_pos = [1, 0] inner_tiles = [16, 33] into %output : tensor<256x128xf32> -> tensor<8x8x16x33xf32> + return %0 : tensor<8x8x16x33xf32> +} + +// ----- + +func.func @pack_invalid_no_padding_no_full_tiles_dyn_tiles(%input: tensor<256x128xf32>, %output: tensor<10x8x?x?xf32>, %tile_size_0: index, %tile_size_1: index) -> tensor<10x8x?x?xf32> { + // expected-error@+1 {{invalid tile factor or output size provided. 
Only full tiles are supported when padding_value is not set}} + %0 = linalg.pack %input inner_dims_pos = [1, 0] inner_tiles = [%tile_size_0, %tile_size_1] into %output : tensor<256x128xf32> -> tensor<10x8x?x?xf32> + return %0 : tensor<10x8x?x?xf32> +} + +// ----- + +func.func @pack_invalid_no_padding_no_full_tiles_dyn_tiles_outperm(%input: tensor<256x128xf32>, %output: tensor<8x10x?x?xf32>, %tile_size_0: index, %tile_size_1: index) -> tensor<8x10x?x?xf32> { + // expected-error@+1 {{invalid tile factor or output size provided. Only full tiles are supported when padding_value is not set}} + %0 = linalg.pack %input outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [%tile_size_0, %tile_size_1] into %output : tensor<256x128xf32> -> tensor<8x10x?x?xf32> + return %0 : tensor<8x10x?x?xf32> +} + +// ----- + +func.func @pad_and_pack_invalid_type(%input: tensor<13x15xf32>, %output: tensor<2x8x8x2xf32>, %pad: i32) -> tensor<2x8x8x2xf32> { + // expected-error@+1 {{expected padding_value has 'f32' but got: 'i32'}} + %0 = linalg.pack %input padding_value(%pad: i32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor<13x15xf32> -> tensor<2x8x8x2xf32> + return %0 : tensor<2x8x8x2xf32> +} + +// ----- + +func.func @pack_invalid_inner_dims_pos_vector(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { + // expected-error@+1 {{invalid inner_dims_pos vector}} + %0 = linalg.pack %input inner_dims_pos = [2, 0] inner_tiles = [2, 2] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32> + return %0 : tensor<8x8x32x16xf32> +} + +// ----- + +func.func @pack_invalid_duplicate_element_in_inner_dims(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { + // expected-error@+1 {{invalid inner_dims_pos vector}} + %0 = linalg.pack %input inner_dims_pos = [1, 1] inner_tiles = [2, 2] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32> + return %0 : tensor<8x8x32x16xf32> +} + +// ----- + +func.func @pack_invalid_duplicate_element_in_outer_perm(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { + // expected-error@+1 {{invalid outer_dims_perm vector}} + %0 = linalg.pack %input outer_dims_perm = [1, 1] inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32> + return %0 : tensor<8x8x32x16xf32> +} + +// ----- + +func.func @pack_invalid_output_rank(%input: tensor<256x128xf32>, %output: tensor<64x32x16xf32>) -> tensor<64x32x16xf32> { + // expected-error@+1 {{packed rank != (unpacked rank + num tiling factors), got 3 != 4}} + %0 = linalg.pack %input inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %output : tensor<256x128xf32> -> tensor<64x32x16xf32> + return %0 : tensor<64x32x16xf32> +} + +// ----- + +func.func @pack_invalid(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { + // expected-error@+1 {{invalid zero tile factor}} + %0 = linalg.pack %input inner_dims_pos = [1, 0] inner_tiles = [0, 2] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32> + return %0 : tensor<8x8x32x16xf32> +} + +// ----- +func.func @pack_mismatch_inner_tile_size_and_output_shape( + %input : tensor, %output : tensor) -> tensor { + // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} + %0 = linalg.pack %input inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %output : tensor -> tensor + return %0 : tensor +} + +// ----- + +func.func 
@pack_dynamic_inner_tile_size_and_static_output_shape( + %input : tensor, %output : tensor) -> tensor { + %c8 = arith.constant 8 : index + // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} + %0 = linalg.pack %input inner_dims_pos = [0, 1] inner_tiles = [8, %c8] into %output : tensor -> tensor + return %0 : tensor +} + +// ----- + +func.func @pack_static_inner_tile_size_and_dynamic_output_shape( + %input : tensor, %output : tensor) -> tensor { + // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} + %0 = linalg.pack %input inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %output : tensor -> tensor + return %0 : tensor +} + +// ----- + +func.func @pack_invalid_outer_dims_perm(%source: tensor<128x256xf32>, %dest: tensor<16x4x32x16xf32>) -> tensor<16x4x32x16xf32> { + // expected-error@+1 {{outer_dims_perm must be a permutation or empty}} + %0 = linalg.pack %source outer_dims_perm = [0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<16x4x32x16xf32> + return %0 : tensor<16x4x32x16xf32> +} + +// ----- + +//===----------------------------------------------------------------------===// +// linalg.unpack +//===----------------------------------------------------------------------===// + +func.func @unpack_invalid_output_rank(%input: tensor<256x128xf32>, %output: tensor<64x32x16xf32>) -> tensor<256x128xf32> { + // expected-error@+1 {{packed rank != (unpacked rank + num tiling factors), got 3 != 4}} + %0 = linalg.unpack %output inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %input : tensor<64x32x16xf32> -> tensor<256x128xf32> + return %0 : tensor<256x128xf32> +} + +// ----- + +func.func @unpack_invalid_out_of_bound_outer_perm(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { + // expected-error@+1 {{invalid outer_dims_perm vector}} + %0 = linalg.unpack %output outer_dims_perm = [2, 1] inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %input : tensor<8x8x32x16xf32> -> tensor<256x128xf32> + return %0 : tensor<256x128xf32> +} + +// ----- + +func.func @unpack_invalid_outer_dims_perm(%source: tensor<128x256xf32>, %dest: tensor<16x4x32x16xf32>) -> tensor<128x256xf32> { + // expected-error@+1 {{outer_dims_perm must be a permutation or empty}} + %0 = linalg.unpack %dest outer_dims_perm = [1] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %source : tensor<16x4x32x16xf32> -> tensor<128x256xf32> + return %0 : tensor<128x256xf32> +} + +// ----- + +func.func @pack_invalid(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { + // expected-error@+1 {{the shape of output is not large enough to hold the packed data. Expected at least 'tensor<8x8x16x32xf32>', got 'tensor<8x8x32x16xf32>'}} + %0 = linalg.pack %input inner_dims_pos = [1, 0] inner_tiles = [16, 32] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32> + return %0 : tensor<8x8x32x16xf32> +} + +// ----- + +func.func @unpack_invalid(%output: tensor<256x128xf32>, %input: tensor<8x8x32x16xf32>) -> tensor<256x128xf32> { + // expected-error@+1 {{the shape of output is not large enough to hold the packed data. 
Expected at least 'tensor<8x32x4x32xf32>', got 'tensor<8x8x32x16xf32>'}} + %0 = linalg.unpack %input inner_dims_pos = [1, 0] inner_tiles = [4, 32] into %output : tensor<8x8x32x16xf32> -> tensor<256x128xf32> + return %0 : tensor<256x128xf32> +} + +// ----- + +func.func @unpack_mismatch_inner_tile_size_and_output_shape( + %input : tensor, %output : tensor) -> tensor { + // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} + %0 = linalg.unpack %input inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %output : tensor -> tensor + return %0 : tensor +} + +// ----- + +func.func @unpack_dynamic_inner_tile_size_and_static_output_shape( + %input : tensor, %output : tensor) -> tensor { + %c8 = arith.constant 8 : index + // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} + %0 = linalg.unpack %input inner_dims_pos = [0, 1] inner_tiles = [%c8, 4] into %output : tensor -> tensor + return %0 : tensor +} + +// ----- + +func.func @unpack_static_inner_tile_size_and_dynamic_output_shape( + %input : tensor, %output : tensor) -> tensor { + // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} + %0 = linalg.unpack %input inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %output : tensor -> tensor + return %0 : tensor +} diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir index 68ea97be911a6..8474eeac0db5b 100644 --- a/mlir/test/Dialect/Linalg/named-ops.mlir +++ b/mlir/test/Dialect/Linalg/named-ops.mlir @@ -2520,3 +2520,108 @@ func.func @select_tensor(%arg0: tensor<4x8x16xi1>, %arg1: tensor<4x8x16xf32>, %a %1 = linalg.select ins(%arg0, %arg1, %arg2 : tensor<4x8x16xi1>, tensor<4x8x16xf32>, tensor<4x8x16xf32>) outs(%0: tensor<4x8x16xf32>) -> tensor<4x8x16xf32> return %1 : tensor<4x8x16xf32> } + +//===----------------------------------------------------------------------===// +// linalg.pack + linalg.unpack +//===----------------------------------------------------------------------===// + +func.func @pack_nc_to_ncnc(%source: tensor<128x256xf32>, %dest: tensor<4x16x32x16xf32>) -> tensor<128x256xf32> { + %0 = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> + %1 = tensor.empty() : tensor<128x256xf32> + %2 = linalg.unpack %0 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %1 : tensor<4x16x32x16xf32> -> tensor<128x256xf32> + return %2 : tensor<128x256xf32> +} + +// CHECK-LABEL: func.func @pack_nc_to_ncnc( +// CHECK-SAME: %[[SOURCE:.*]]: tensor<128x256xf32>, +// CHECK-SAME: %[[DEST:.*]]: tensor<4x16x32x16xf32>) +// CHECK: %[[PACKED:.*]] = linalg.pack %[[SOURCE]] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %[[DEST]] : tensor<128x256xf32> -> tensor<4x16x32x16xf32> +// CHECK: %[[BUFF:.*]] = tensor.empty() : tensor<128x256xf32> +// CHECK: %{{.*}} = linalg.unpack %[[PACKED]] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %[[BUFF]] : tensor<4x16x32x16xf32> -> tensor<128x256xf32> + +// ----- + +func.func @pack_nc_to_ncnc_with_padding(%source: tensor<13x15xf32>, %dest: tensor<2x8x8x2xf32>, %padding: f32) -> tensor<13x15xf32> { + %0 = linalg.pack %source padding_value(%padding : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest : tensor<13x15xf32> -> tensor<2x8x8x2xf32> + %1 = tensor.empty() : tensor<13x15xf32> + %2 = linalg.unpack %0 inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %1 : 
tensor<2x8x8x2xf32> -> tensor<13x15xf32> + return %2 : tensor<13x15xf32> +} + +// CHECK-LABEL: func.func @pack_nc_to_ncnc_with_padding( +// CHECK-SAME: %[[SOURCE:.*]]: tensor<13x15xf32>, +// CHECK-SAME: %[[DEST:.*]]: tensor<2x8x8x2xf32>, +// CHECK-SAME: %[[PADDING:.*]]: f32) +// CHECK: %[[PACKED:.*]] = linalg.pack %[[SOURCE]] padding_value(%[[PADDING]] : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %[[DEST]] : tensor<13x15xf32> -> tensor<2x8x8x2xf32> +// CHECK: %[[BUFF:.*]] = tensor.empty() : tensor<13x15xf32> +// CHECK: %{{.*}} = linalg.unpack %[[PACKED]] inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %[[BUFF]] : tensor<2x8x8x2xf32> -> tensor<13x15xf32> + +// ----- + +func.func @pack_ck_to_kcck(%source: tensor<128x256xf32>, %dest: tensor<16x4x32x16xf32>) -> tensor<128x256xf32> { + %0 = linalg.pack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<16x4x32x16xf32> + %1 = tensor.empty() : tensor<128x256xf32> + %2 = linalg.unpack %0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %1 : tensor<16x4x32x16xf32> -> tensor<128x256xf32> + return %2 : tensor<128x256xf32> +} + +// CHECK-LABEL: func.func @pack_ck_to_kcck( +// CHECK-SAME: %[[SOURCE:.*]]: tensor<128x256xf32>, +// CHECK-SAME: %[[DEST:.*]]: tensor<16x4x32x16xf32>) +// CHECK: %[[PACKED:.*]] = linalg.pack %[[SOURCE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %[[DEST]] : tensor<128x256xf32> -> tensor<16x4x32x16xf32> +// CHECK: %[[BUFF:.*]] = tensor.empty() : tensor<128x256xf32> +// CHECK: %{{.*}} = linalg.unpack %[[PACKED]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %[[BUFF]] : tensor<16x4x32x16xf32> -> tensor<128x256xf32> + +// ----- + +func.func @pad_and_pack_fully_dynamic(%source: tensor, %dest: tensor, %pad: f32, %tile_n : index, %tile_m : index) -> tensor { + %0 = linalg.pack %source padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func.func @pad_and_pack_fully_dynamic( +// CHECK-SAME: %[[SOURCE:.*]]: tensor, +// CHECK-SAME: %[[DEST:.*]]: tensor, +// CHECK-SAME: %[[PAD:.*]]: f32, +// CHECK-SAME: %[[TILE_N:.*]]: index, +// CHECK-SAME: %[[TILE_M:.*]]: index) +// CHECK: %{{.*}} = linalg.pack %[[SOURCE]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [%[[TILE_N]], %[[TILE_M]]] into %[[DEST]] : tensor -> tensor + +// ----- + +func.func @pad_and_pack_partially_dynamic(%source: tensor, %dest: tensor, %pad: f32) -> tensor { + %0 = linalg.pack %source padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest : tensor -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func.func @pad_and_pack_partially_dynamic( +// CHECK-SAME: %[[SOURCE:.*]]: tensor, +// CHECK-SAME: %[[DEST:.*]]: tensor, +// CHECK-SAME: %[[PAD:.*]]: f32) +// CHECK: %{{.*}} = linalg.pack %[[SOURCE]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %[[DEST]] : tensor -> tensor + +// ----- + +func.func @unpack_fully_dynamic(%source: tensor, %dest: tensor, %tile_n : index, %tile_m : index) -> tensor { + %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor + return %0 : tensor +} + +// CHECK-LABEL: func.func @unpack_fully_dynamic( +// CHECK-SAME: %[[SOURCE:.*]]: tensor, +// CHECK-SAME: %[[DEST:.*]]: tensor, +// CHECK-SAME: %[[TILE_N:.*]]: index, +// CHECK-SAME: 
%[[TILE_M:.*]]: index) +// CHECK: %{{.*}} = linalg.unpack %[[SOURCE]] inner_dims_pos = [0, 1] inner_tiles = [%[[TILE_N]], %[[TILE_M]]] into %[[DEST]] : tensor -> tensor + +// ----- + +func.func @unpack_partially_dynamic(%source: tensor, %dest: tensor) -> tensor { + %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest : tensor -> tensor + return %0: tensor +} + +// CHECK-LABEL: func.func @unpack_partially_dynamic( +// CHECK-SAME: %[[SOURCE:.*]]: tensor, +// CHECK-SAME: %[[DEST:.*]]: tensor) +// CHECK: %{{.*}} = linalg.unpack %[[SOURCE]] inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %[[DEST]] : tensor -> tensor diff --git a/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir b/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir similarity index 86% rename from mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir rename to mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir index f9e51ae52a74b..51350e5bc8498 100644 --- a/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir +++ b/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -split-input-file -test-tensor-transform-patterns="test-simplify-pack-unpack-patterns" %s | FileCheck %s +// RUN: mlir-opt -split-input-file -test-linalg-transform-patterns="test-simplify-pack-unpack-patterns" %s | FileCheck %s // CHECK-LABEL: func.func @single_dim_packing( // CHECK-SAME: %[[ARG0:.+]]: tensor<256xf32>) @@ -6,7 +6,7 @@ // CHECK: return %[[EXPANDED]] : tensor<8x32xf32> func.func @single_dim_packing(%arg0: tensor<256xf32>) -> tensor<8x32xf32> { %empty = tensor.empty() : tensor<8x32xf32> - %0 = tensor.pack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<256xf32> -> tensor<8x32xf32> + %0 = linalg.pack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<256xf32> -> tensor<8x32xf32> return %0 : tensor<8x32xf32> } @@ -15,11 +15,11 @@ func.func @single_dim_packing(%arg0: tensor<256xf32>) -> tensor<8x32xf32> { // CHECK-LABEL: func.func @single_dim_packing_with_padding( // CHECK-SAME: %[[ARG0:.+]]: tensor<255xf32>) // CHECK-NOT: tensor.expand_shape -// CHECK: tensor.pack +// CHECK: linalg.pack func.func @single_dim_packing_with_padding(%arg0: tensor<255xf32>) -> tensor<8x32xf32> { %empty = tensor.empty() : tensor<8x32xf32> %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.pack %arg0 padding_value(%cst : f32) inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<255xf32> -> tensor<8x32xf32> + %0 = linalg.pack %arg0 padding_value(%cst : f32) inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<255xf32> -> tensor<8x32xf32> return %0 : tensor<8x32xf32> } @@ -31,7 +31,7 @@ func.func @single_dim_packing_with_padding(%arg0: tensor<255xf32>) -> tensor<8x3 // CHECK: return %[[EXPANDED]] : tensor<5x8x32xf32> func.func @single_last_inner_dim_packing(%arg0: tensor<5x256xf32>) -> tensor<5x8x32xf32> { %empty = tensor.empty() : tensor<5x8x32xf32> - %0 = tensor.pack %arg0 inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x256xf32> -> tensor<5x8x32xf32> + %0 = linalg.pack %arg0 inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x256xf32> -> tensor<5x8x32xf32> return %0 : tensor<5x8x32xf32> } @@ -43,7 +43,7 @@ func.func @single_last_inner_dim_packing(%arg0: tensor<5x256xf32>) -> tensor<5x8 // CHECK: return %[[EXPANDED]] : tensor<2x32xf32> func.func @pack_1d_with_outer_dims_perm(%arg0: tensor<64xf32>) -> tensor<2x32xf32> { %empty = tensor.empty() : tensor<2x32xf32> - %pack = tensor.pack %arg0 outer_dims_perm = [0] inner_dims_pos = [0] 
inner_tiles = [32] into %empty : tensor<64xf32> -> tensor<2x32xf32> + %pack = linalg.pack %arg0 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<64xf32> -> tensor<2x32xf32> return %pack : tensor<2x32xf32> } @@ -55,7 +55,7 @@ func.func @pack_1d_with_outer_dims_perm(%arg0: tensor<64xf32>) -> tensor<2x32xf3 // CHECK: return %[[EXPANDED]] : tensor<5x8x32xf32> func.func @single_last_inner_dim_packing_with_identity_outer_dims_perm(%arg0: tensor<5x256xf32>) -> tensor<5x8x32xf32> { %empty = tensor.empty() : tensor<5x8x32xf32> - %0 = tensor.pack %arg0 outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x256xf32> -> tensor<5x8x32xf32> + %0 = linalg.pack %arg0 outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x256xf32> -> tensor<5x8x32xf32> return %0 : tensor<5x8x32xf32> } @@ -63,10 +63,10 @@ func.func @single_last_inner_dim_packing_with_identity_outer_dims_perm(%arg0: te // CHECK-LABEL: func.func @packing_with_outer_dims_perm( // CHECK-NOT: tensor.expand_shape -// CHECK: tensor.pack +// CHECK: linalg.pack func.func @packing_with_outer_dims_perm(%arg0: tensor<5x256xf32>) -> tensor<8x5x32xf32> { %empty = tensor.empty() : tensor<8x5x32xf32> - %0 = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x256xf32> -> tensor<8x5x32xf32> + %0 = linalg.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x256xf32> -> tensor<8x5x32xf32> return %0 : tensor<8x5x32xf32> } @@ -74,10 +74,10 @@ func.func @packing_with_outer_dims_perm(%arg0: tensor<5x256xf32>) -> tensor<8x5x // CHECK-LABEL: func.func @single_first_inner_dim_packing( // CHECK-NOT: tensor.expand_shape -// CHECK: tensor.pack +// CHECK: linalg.pack func.func @single_first_inner_dim_packing(%arg0: tensor<256x5xf32>) -> tensor<8x5x32xf32> { %empty = tensor.empty() : tensor<8x5x32xf32> - %0 = tensor.pack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<256x5xf32> -> tensor<8x5x32xf32> + %0 = linalg.pack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<256x5xf32> -> tensor<8x5x32xf32> return %0 : tensor<8x5x32xf32> } @@ -89,7 +89,7 @@ func.func @single_first_inner_dim_packing(%arg0: tensor<256x5xf32>) -> tensor<8x // CHECK: return %[[EXPANDED]] func.func @pack_1x32_to_1x32x1x1(%arg0 : tensor<1x32xf32>) -> tensor<1x32x1x1xf32> { %empty = tensor.empty() : tensor<1x32x1x1xf32> - %pack = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [1, 1] into %empty + %pack = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [1, 1] into %empty : tensor<1x32xf32> -> tensor<1x32x1x1xf32> return %pack : tensor<1x32x1x1xf32> } @@ -102,7 +102,7 @@ func.func @pack_1x32_to_1x32x1x1(%arg0 : tensor<1x32xf32>) -> tensor<1x32x1x1xf3 // CHECK: return %[[EXPANDED]] func.func @pack_1x32_to_1x16x1x2(%arg0 : tensor<1x32xf32>) -> tensor<1x16x1x2xf32> { %empty = tensor.empty() : tensor<1x16x1x2xf32> - %pack = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [1, 2] into %empty + %pack = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [1, 2] into %empty : tensor<1x32xf32> -> tensor<1x16x1x2xf32> return %pack : tensor<1x16x1x2xf32> } @@ -115,7 +115,7 @@ func.func @pack_1x32_to_1x16x1x2(%arg0 : tensor<1x32xf32>) -> tensor<1x16x1x2xf3 // CHECK: return %[[EXPANDED]] func.func @pack_32x1_to_16x1x2x1(%arg0 : tensor<32x1xf32>) -> tensor<1x16x2x1xf32> { %empty = tensor.empty() : tensor<1x16x2x1xf32> - %pack = tensor.pack %arg0 
outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 1] into %empty + %pack = linalg.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 1] into %empty : tensor<32x1xf32> -> tensor<1x16x2x1xf32> return %pack : tensor<1x16x2x1xf32> } @@ -124,10 +124,10 @@ func.func @pack_32x1_to_16x1x2x1(%arg0 : tensor<32x1xf32>) -> tensor<1x16x2x1xf3 // CHECK-LABEL: func.func @pack_32x1_to_16x1x1x2 // CHECK-NOT: tensor.expand_shape -// CHECK: tensor.pack +// CHECK: linalg.pack func.func @pack_32x1_to_16x1x1x2(%arg0 : tensor<32x1xf32>) -> tensor<16x1x1x2xf32> { %empty = tensor.empty() : tensor<16x1x1x2xf32> - %pack = tensor.pack %arg0 inner_dims_pos = [1, 0] inner_tiles = [1, 2] into %empty + %pack = linalg.pack %arg0 inner_dims_pos = [1, 0] inner_tiles = [1, 2] into %empty : tensor<32x1xf32> -> tensor<16x1x1x2xf32> return %pack : tensor<16x1x1x2xf32> } @@ -140,7 +140,7 @@ func.func @pack_32x1_to_16x1x1x2(%arg0 : tensor<32x1xf32>) -> tensor<16x1x1x2xf3 // CHECK: return %[[COLLAPSED]] func.func @unpack_1d_to_collapse(%arg0: tensor<8x32xf32>) -> tensor<256xf32> { %empty = tensor.empty() : tensor<256xf32> - %0 = tensor.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<8x32xf32> -> tensor<256xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<8x32xf32> -> tensor<256xf32> return %0 : tensor<256xf32> } @@ -148,10 +148,10 @@ func.func @unpack_1d_to_collapse(%arg0: tensor<8x32xf32>) -> tensor<256xf32> { // CHECK-LABEL: func.func @unpack_to_partial_slice // CHECK-NOT: tensor.collapse -// CHECK: tensor.unpack +// CHECK: linalg.unpack func.func @unpack_to_partial_slice(%arg0: tensor<8x32xf32>) -> tensor<255xf32> { %empty = tensor.empty() : tensor<255xf32> - %0 = tensor.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<8x32xf32> -> tensor<255xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<8x32xf32> -> tensor<255xf32> return %0 : tensor<255xf32> } @@ -159,14 +159,14 @@ func.func @unpack_to_partial_slice(%arg0: tensor<8x32xf32>) -> tensor<255xf32> { // CHECK-LABEL: func.func @unpack_dynamic // CHECK-NOT: tensor.collapse -// CHECK: tensor.unpack +// CHECK: linalg.unpack func.func @unpack_dynamic(%arg0: tensor) -> tensor { %c32 = arith.constant 32 : index %c0 = arith.constant 0 : index %d0 = tensor.dim %arg0, %c0 : tensor %size = arith.muli %d0, %c32 : index %empty = tensor.empty(%size) : tensor - %0 = tensor.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor -> tensor + %0 = linalg.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor -> tensor return %0 : tensor } @@ -178,7 +178,7 @@ func.func @unpack_dynamic(%arg0: tensor) -> tensor { // CHECK: return %[[COLLAPSED]] : tensor<5x256xf32> func.func @single_last_inner_dim_unpacking(%arg0: tensor<5x8x32xf32>) -> tensor<5x256xf32> { %empty = tensor.empty() : tensor<5x256xf32> - %0 = tensor.unpack %arg0 inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x8x32xf32> -> tensor<5x256xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x8x32xf32> -> tensor<5x256xf32> return %0 : tensor<5x256xf32> } @@ -190,7 +190,7 @@ func.func @single_last_inner_dim_unpacking(%arg0: tensor<5x8x32xf32>) -> tensor< // CHECK: return %[[COLLAPSED]] : tensor<5x256xf32> func.func @single_last_inner_dim_unpacking_with_identity_outer_dims_perm(%arg0: tensor<5x8x32xf32>) -> tensor<5x256xf32> { %empty = tensor.empty() : tensor<5x256xf32> - 
%0 = tensor.unpack %arg0 outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x8x32xf32> -> tensor<5x256xf32> + %0 = linalg.unpack %arg0 outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<5x8x32xf32> -> tensor<5x256xf32> return %0 : tensor<5x256xf32> } @@ -198,10 +198,10 @@ func.func @single_last_inner_dim_unpacking_with_identity_outer_dims_perm(%arg0: // CHECK-LABEL: func.func @unpacking_with_outer_dims_perm( // CHECK-NOT: tensor.collpase_shape -// CHECK: tensor.unpack +// CHECK: linalg.unpack func.func @unpacking_with_outer_dims_perm(%arg0: tensor<8x5x32xf32>) -> tensor<5x256xf32> { %empty = tensor.empty() : tensor<5x256xf32> - %0 = tensor.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<8x5x32xf32> -> tensor<5x256xf32> + %0 = linalg.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [1] inner_tiles = [32] into %empty : tensor<8x5x32xf32> -> tensor<5x256xf32> return %0 : tensor<5x256xf32> } @@ -209,10 +209,10 @@ func.func @unpacking_with_outer_dims_perm(%arg0: tensor<8x5x32xf32>) -> tensor<5 // CHECK-LABEL: func.func @single_first_inner_dim_unpacking( // CHECK-NOT: tensor.collapse_shape -// CHECK: tensor.unpack +// CHECK: linalg.unpack func.func @single_first_inner_dim_unpacking(%arg0: tensor<8x5x32xf32>) -> tensor<256x5xf32> { %empty = tensor.empty() : tensor<256x5xf32> - %0 = tensor.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<8x5x32xf32> -> tensor<256x5xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [0] inner_tiles = [32] into %empty : tensor<8x5x32xf32> -> tensor<256x5xf32> return %0 : tensor<256x5xf32> } @@ -224,7 +224,7 @@ func.func @single_first_inner_dim_unpacking(%arg0: tensor<8x5x32xf32>) -> tensor // CHECK: return %[[COLLAPSED]] func.func @unpack_1x32x1x1_to_1x32(%arg0 : tensor<1x32x1x1xf32>) -> tensor<1x32xf32> { %empty = tensor.empty() : tensor<1x32xf32> - %unpack = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [1, 1] into %empty + %unpack = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [1, 1] into %empty : tensor<1x32x1x1xf32> -> tensor<1x32xf32> return %unpack : tensor<1x32xf32> } @@ -237,7 +237,7 @@ func.func @unpack_1x32x1x1_to_1x32(%arg0 : tensor<1x32x1x1xf32>) -> tensor<1x32x // CHECK: return %[[COLLAPSED]] func.func @unpack_1x2x1x16_to_1x32(%arg0 : tensor<1x2x1x16xf32>) -> tensor<1x32xf32> { %empty = tensor.empty() : tensor<1x32xf32> - %unpack = tensor.unpack %arg0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [1, 16] into %empty + %unpack = linalg.unpack %arg0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [1, 16] into %empty : tensor<1x2x1x16xf32> -> tensor<1x32xf32> return %unpack : tensor<1x32xf32> } @@ -250,7 +250,7 @@ func.func @unpack_1x2x1x16_to_1x32(%arg0 : tensor<1x2x1x16xf32>) -> tensor<1x32x // CHECK: return %[[COLLAPSED]] func.func @unpack_16x1x2x1_to_32x1(%arg0 : tensor<1x16x2x1xf32>) -> tensor<32x1xf32> { %empty = tensor.empty() : tensor<32x1xf32> - %unpack = tensor.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 1] into %empty + %unpack = linalg.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 1] into %empty : tensor<1x16x2x1xf32> -> tensor<32x1xf32> return %unpack : tensor<32x1xf32> } @@ -259,10 +259,10 @@ func.func @unpack_16x1x2x1_to_32x1(%arg0 : tensor<1x16x2x1xf32>) -> tensor<32x1x // CHECK-LABEL: func.func @unpack_16x1x1x2_to_32x1 // CHECK-NOT: tensor.collapse_shape -// CHECK: 
tensor.unpack +// CHECK: linalg.unpack func.func @unpack_16x1x1x2_to_32x1(%arg0 : tensor<16x1x1x2xf32>) -> tensor<32x1xf32> { %empty = tensor.empty() : tensor<32x1xf32> - %unpack = tensor.unpack %arg0 inner_dims_pos = [1, 0] inner_tiles = [1, 2] into %empty + %unpack = linalg.unpack %arg0 inner_dims_pos = [1, 0] inner_tiles = [1, 2] into %empty : tensor<16x1x1x2xf32> -> tensor<32x1xf32> return %unpack : tensor<32x1xf32> } @@ -275,7 +275,7 @@ func.func @unpack_16x1x1x2_to_32x1(%arg0 : tensor<16x1x1x2xf32>) -> tensor<32x1x // CHECK: return %[[EXPANDED]] : tensor<1x1x32x64xf32> func.func @pad_like_pack(%arg0: tensor<32x64xf32>) -> tensor<1x1x32x64xf32> { %empty = tensor.empty() : tensor<1x1x32x64xf32> - %0 = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<32x64xf32> -> tensor<1x1x32x64xf32> + %0 = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<32x64xf32> -> tensor<1x1x32x64xf32> return %0 : tensor<1x1x32x64xf32> } @@ -287,7 +287,7 @@ func.func @pad_like_pack(%arg0: tensor<32x64xf32>) -> tensor<1x1x32x64xf32> { // CHECK: return %[[EXPANDED]] : tensor<1x1x32x64xf32> func.func @pad_like_pack_with_outer_dims_perm(%arg0: tensor<32x64xf32>) -> tensor<1x1x32x64xf32> { %empty = tensor.empty() : tensor<1x1x32x64xf32> - %0 = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<32x64xf32> -> tensor<1x1x32x64xf32> + %0 = linalg.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<32x64xf32> -> tensor<1x1x32x64xf32> return %0 : tensor<1x1x32x64xf32> } @@ -299,7 +299,7 @@ func.func @pad_like_pack_with_outer_dims_perm(%arg0: tensor<32x64xf32>) -> tenso // CHECK: return %[[EXPANDED]] : tensor<32x1x64xf32> func.func @inner_pad_like_pack(%arg0: tensor<32x64xf32>) -> tensor<32x1x64xf32> { %empty = tensor.empty() : tensor<32x1x64xf32> - %0 = tensor.pack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x64xf32> -> tensor<32x1x64xf32> + %0 = linalg.pack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x64xf32> -> tensor<32x1x64xf32> return %0 : tensor<32x1x64xf32> } @@ -309,11 +309,11 @@ func.func @inner_pad_like_pack(%arg0: tensor<32x64xf32>) -> tensor<32x1x64xf32> // CHECK-LABEL: func.func @pad_and_inner_dim_shuffle_pack( // CHECK-SAME: %[[ARG0:.+]]: tensor<32x64xf32>) // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x1x64x32xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] inner_dims_pos = [1, 0] inner_tiles = [64, 32] into %[[EMPTY]] : tensor<32x64xf32> -> tensor<1x1x64x32xf32> +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] inner_dims_pos = [1, 0] inner_tiles = [64, 32] into %[[EMPTY]] : tensor<32x64xf32> -> tensor<1x1x64x32xf32> // CHECK: return %[[PACK]] : tensor<1x1x64x32xf32> func.func @pad_and_inner_dim_shuffle_pack(%arg0: tensor<32x64xf32>) -> tensor<1x1x64x32xf32> { %empty = tensor.empty() : tensor<1x1x64x32xf32> - %0 = tensor.pack %arg0 inner_dims_pos = [1, 0] inner_tiles = [64, 32] into %empty : tensor<32x64xf32> -> tensor<1x1x64x32xf32> + %0 = linalg.pack %arg0 inner_dims_pos = [1, 0] inner_tiles = [64, 32] into %empty : tensor<32x64xf32> -> tensor<1x1x64x32xf32> return %0 : tensor<1x1x64x32xf32> } @@ -323,11 +323,11 @@ func.func @pad_and_inner_dim_shuffle_pack(%arg0: tensor<32x64xf32>) -> tensor<1x // CHECK-LABEL: func.func @pad_like_pack_with_transpose( // CHECK-SAME: %[[ARG0:.+]]: tensor<32x64x16xf32>) // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32x1x16x64xf32> 
-// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [64] into %[[EMPTY]] : tensor<32x64x16xf32> -> tensor<32x1x16x64xf32> +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [64] into %[[EMPTY]] : tensor<32x64x16xf32> -> tensor<32x1x16x64xf32> // CHECK: return %[[PACK]] : tensor<32x1x16x64xf32> func.func @pad_like_pack_with_transpose(%arg0: tensor<32x64x16xf32>) -> tensor<32x1x16x64xf32> { %empty = tensor.empty() : tensor<32x1x16x64xf32> - %0 = tensor.pack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x64x16xf32> -> tensor<32x1x16x64xf32> + %0 = linalg.pack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x64x16xf32> -> tensor<32x1x16x64xf32> return %0 : tensor<32x1x16x64xf32> } @@ -339,7 +339,7 @@ func.func @pad_like_pack_with_transpose(%arg0: tensor<32x64x16xf32>) -> tensor<3 // CHECK: return %[[COLLAPSED]] : tensor<32x64xf32> func.func @unpad_like_unpack(%arg0: tensor<1x1x32x64xf32>) -> tensor<32x64xf32> { %empty = tensor.empty() : tensor<32x64xf32> - %0 = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<1x1x32x64xf32> -> tensor<32x64xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<1x1x32x64xf32> -> tensor<32x64xf32> return %0 : tensor<32x64xf32> } @@ -351,7 +351,7 @@ func.func @unpad_like_unpack(%arg0: tensor<1x1x32x64xf32>) -> tensor<32x64xf32> // CHECK: return %[[COLLAPSED]] : tensor<32x64xf32> func.func @unpad_like_unpack_with_outer_dims_perm(%arg0: tensor<1x1x32x64xf32>) -> tensor<32x64xf32> { %empty = tensor.empty() : tensor<32x64xf32> - %0 = tensor.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<1x1x32x64xf32> -> tensor<32x64xf32> + %0 = linalg.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<1x1x32x64xf32> -> tensor<32x64xf32> return %0 : tensor<32x64xf32> } @@ -363,7 +363,7 @@ func.func @unpad_like_unpack_with_outer_dims_perm(%arg0: tensor<1x1x32x64xf32>) // CHECK: return %[[COLLAPSED]] : tensor<32x64xf32> func.func @inner_unpad_like_unpack(%arg0: tensor<32x1x64xf32>) -> tensor<32x64xf32> { %empty = tensor.empty() : tensor<32x64xf32> - %0 = tensor.unpack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x1x64xf32> -> tensor<32x64xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x1x64xf32> -> tensor<32x64xf32> return %0 : tensor<32x64xf32> } @@ -373,11 +373,11 @@ func.func @inner_unpad_like_unpack(%arg0: tensor<32x1x64xf32>) -> tensor<32x64xf // CHECK-LABEL: func.func @unpad_and_inner_dim_shuffle_pack( // CHECK-SAME: %[[ARG0:.+]]: tensor<1x1x32x64xf32>) // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<64x32xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] inner_dims_pos = [1, 0] inner_tiles = [32, 64] into %[[EMPTY]] : tensor<1x1x32x64xf32> -> tensor<64x32xf32> +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] inner_dims_pos = [1, 0] inner_tiles = [32, 64] into %[[EMPTY]] : tensor<1x1x32x64xf32> -> tensor<64x32xf32> // CHECK: return %[[UNPACK]] : tensor<64x32xf32> func.func @unpad_and_inner_dim_shuffle_pack(%arg0: tensor<1x1x32x64xf32>) -> tensor<64x32xf32> { %empty = tensor.empty() : tensor<64x32xf32> - %0 = tensor.unpack %arg0 inner_dims_pos = [1, 0] inner_tiles = [32, 64] into %empty : tensor<1x1x32x64xf32> -> tensor<64x32xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [1, 0] inner_tiles = [32, 
64] into %empty : tensor<1x1x32x64xf32> -> tensor<64x32xf32> return %0 : tensor<64x32xf32> } @@ -387,10 +387,10 @@ func.func @unpad_and_inner_dim_shuffle_pack(%arg0: tensor<1x1x32x64xf32>) -> ten // CHECK-LABEL: func.func @unpad_like_unpack_with_transpose( // CHECK-SAME: %[[ARG0:.+]]: tensor<32x1x16x64xf32>) // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32x64x16xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [64] into %[[EMPTY]] : tensor<32x1x16x64xf32> -> tensor<32x64x16xf32> +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [64] into %[[EMPTY]] : tensor<32x1x16x64xf32> -> tensor<32x64x16xf32> // CHECK: return %[[UNPACK]] : tensor<32x64x16xf32> func.func @unpad_like_unpack_with_transpose(%arg0: tensor<32x1x16x64xf32>) -> tensor<32x64x16xf32> { %empty = tensor.empty() : tensor<32x64x16xf32> - %0 = tensor.unpack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x1x16x64xf32> -> tensor<32x64x16xf32> + %0 = linalg.unpack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x1x16x64xf32> -> tensor<32x64x16xf32> return %0 : tensor<32x64x16xf32> } diff --git a/mlir/test/Dialect/Linalg/specialize-generic-ops-fail.mlir b/mlir/test/Dialect/Linalg/specialize-generic-ops-fail.mlir index 542a7ed4a198b..357f2c11a7936 100644 --- a/mlir/test/Dialect/Linalg/specialize-generic-ops-fail.mlir +++ b/mlir/test/Dialect/Linalg/specialize-generic-ops-fail.mlir @@ -6,11 +6,26 @@ // CHECK-LABEL: @transpose_and_broadcast // CHECK: linalg.generic func.func @transpose_and_broadcast(%arg0: tensor<7x8xf32>, %arg1: tensor<8x7x9xf32>) -> tensor<8x7x9xf32> { - %0 = linalg.generic - {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} - ins(%arg0 : tensor<7x8xf32>) outs(%arg1 : tensor<8x7x9xf32>) { - ^bb0(%in: f32, %out: f32): - linalg.yield %in : f32 + %res = linalg.generic { + indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"] + } ins(%arg0 : tensor<7x8xf32>) outs(%arg1 : tensor<8x7x9xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 } -> tensor<8x7x9xf32> - return %0 : tensor<8x7x9xf32> + return %res : tensor<8x7x9xf32> +} + +// ----- + +#map = affine_map<(d0) -> (d0)> +// CHECK-LABEL: @neither_permutation_nor_broadcast +// CHECK: linalg.generic +func.func @neither_permutation_nor_broadcast(%init : tensor<8xi32>) -> tensor<8xi32> { + %res = linalg.generic { + indexing_maps = [#map], iterator_types = ["parallel"] + } outs(%init: tensor<8xi32>) { + ^bb0(%out: i32): + linalg.yield %out: i32 + } -> tensor<8xi32> + return %res : tensor<8xi32> } diff --git a/mlir/test/Dialect/Linalg/td/decompose-pack.mlir b/mlir/test/Dialect/Linalg/td/decompose-pack.mlir index 49c45e29d5a14..32054134266c7 100644 --- a/mlir/test/Dialect/Linalg/td/decompose-pack.mlir +++ b/mlir/test/Dialect/Linalg/td/decompose-pack.mlir @@ -1,6 +1,6 @@ module @transforms attributes { transform.with_named_sequence } { transform.named_sequence @decompose_pack(%module: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module : (!transform.any_op) -> !transform.any_op + %pack = transform.structured.match ops{["linalg.pack"]} in %module : (!transform.any_op) -> !transform.any_op %1 = transform.get_parent_op %pack {isolated_from_above} : (!transform.any_op) -> !transform.any_op transform.apply_patterns to %1 { diff --git a/mlir/test/Dialect/Linalg/td/decompose-unpack.mlir 
b/mlir/test/Dialect/Linalg/td/decompose-unpack.mlir index 11243634262e0..f5b8403af5e58 100644 --- a/mlir/test/Dialect/Linalg/td/decompose-unpack.mlir +++ b/mlir/test/Dialect/Linalg/td/decompose-unpack.mlir @@ -1,6 +1,6 @@ module @transforms attributes { transform.with_named_sequence } { transform.named_sequence @decompose_unpack(%module: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.unpack"]} in %module : (!transform.any_op) -> !transform.any_op + %pack = transform.structured.match ops{["linalg.unpack"]} in %module : (!transform.any_op) -> !transform.any_op %1 = transform.get_parent_op %pack {isolated_from_above} : (!transform.any_op) -> !transform.any_op transform.apply_patterns to %1 { diff --git a/mlir/test/Dialect/Linalg/transform-lower-pack.mlir b/mlir/test/Dialect/Linalg/transform-lower-pack.mlir index 5f8ff36a16578..81fd7a8a947d7 100644 --- a/mlir/test/Dialect/Linalg/transform-lower-pack.mlir +++ b/mlir/test/Dialect/Linalg/transform-lower-pack.mlir @@ -4,7 +4,7 @@ func.func @pack(%arg0: tensor<129x47x16x16xf32>, %arg1: tensor<17x2x16x16x32x8xf32>) -> tensor<17x2x16x16x32x8xf32> { %cst_0 = arith.constant 0.0 : f32 - // tensor.pack is lowered to tensor.pad + tensor.expand_shape + linalg.transpose + // linalg.pack is lowered to tensor.pad + tensor.expand_shape + linalg.transpose // CHECK: tensor.pad {{.*}} low[0, 0, 0, 0] // CHECK: : tensor<129x47x16x16xf32> to tensor<136x64x16x16xf32> // CHECK: tensor.expand_shape %{{.*}} [{{.*}}[0, 1], [2, 3], [4], [5]] @@ -13,16 +13,16 @@ func.func @pack(%arg0: tensor<129x47x16x16xf32>, %arg1: tensor<17x2x16x16x32x8xf // CHECK-SAME: ins(%{{.*}} : tensor<17x8x2x32x16x16xf32>) // CHECK-SAME: outs(%{{.*}} : tensor<17x2x16x16x32x8xf32>) // CHECK-SAME: permutation = [0, 2, 4, 5, 3, 1] - %pack = tensor.pack %arg0 padding_value(%cst_0 : f32) inner_dims_pos = [1, 0] inner_tiles = [32, 8] into %arg1 + %pack = linalg.pack %arg0 padding_value(%cst_0 : f32) inner_dims_pos = [1, 0] inner_tiles = [32, 8] into %arg1 : tensor<129x47x16x16xf32> -> tensor<17x2x16x16x32x8xf32> return %pack : tensor<17x2x16x16x32x8xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -33,7 +33,7 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func.func @pack( func.func @pack(%arg0: tensor<128x8xf32>, %arg1: tensor<8x8x16x1xf32>) -> tensor<8x8x16x1xf32> { - // tensor.pack is lowered to tensor.pad + tensor.expand_shape + linalg.transpose + // linalg.pack is lowered to tensor.pad + tensor.expand_shape + linalg.transpose // CHECK: tensor.pad {{.*}} low[0, 0] // CHECK: : tensor<128x8xf32> to tensor<128x8xf32> // CHECK: tensor.expand_shape %{{.*}} [{{.*}}[0, 1], [2, 3]] @@ -43,7 +43,7 @@ func.func @pack(%arg0: tensor<128x8xf32>, %arg1: tensor<8x8x16x1xf32>) -> tensor // CHECK-SAME: outs(%{{.*}} : tensor<8x8x16x1xf32>) // CHECK-SAME: permutation = [0, 2, 1, 3] - %pack = tensor.pack %arg0 inner_dims_pos = 
[0, 1] inner_tiles = [16, 1] into %arg1 + %pack = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %arg1 : tensor<128x8xf32> -> tensor<8x8x16x1xf32> return %pack : tensor<8x8x16x1xf32> @@ -51,9 +51,9 @@ func.func @pack(%arg0: tensor<128x8xf32>, %arg1: tensor<8x8x16x1xf32>) -> tensor module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -67,7 +67,7 @@ module attributes {transform.with_named_sequence} { func.func @pack_as_pad(%arg0: tensor<129x47x16x16xf32>, %arg1: tensor<1x1x1x1x136x64x16x16xf32>) -> tensor<1x1x1x1x136x64x16x16xf32> { %cst_0 = arith.constant 0.0 : f32 - // tensor.pack is lowered to tensor.pad + tensor.insert_slice + // linalg.pack is lowered to tensor.pad + tensor.insert_slice // CHECK: %[[PAD:.*]] = tensor.pad %[[SRC]] low[0, 0, 0, 0] high[7, 17, 0, 0] // CHECK: : tensor<129x47x16x16xf32> to tensor<136x64x16x16xf32> // CHECK: %[[RES:.*]] = tensor.insert_slice %[[PAD]] into %[[OUT]] @@ -79,16 +79,16 @@ func.func @pack_as_pad(%arg0: tensor<129x47x16x16xf32>, %arg1: tensor<1x1x1x1x13 // CHECK-SAME: [1, 1, 1, 1, 1, 1, 1, 1] // CHECK-SAME: : tensor<136x64x16x16xf32> into tensor<1x1x1x1x136x64x16x16xf32> // CHECK: return %[[RES]] - %pack = tensor.pack %arg0 padding_value(%cst_0 : f32) inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 + %pack = linalg.pack %arg0 padding_value(%cst_0 : f32) inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 : tensor<129x47x16x16xf32> -> tensor<1x1x1x1x136x64x16x16xf32> return %pack : tensor<1x1x1x1x136x64x16x16xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -101,22 +101,22 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func.func @pack_as_pad_disabled_insert_slice( func.func @pack_as_pad_disabled_insert_slice(%arg0: tensor<129x47x16x16xf32>, %arg1: tensor<1x1x1x1x136x64x16x16xf32>) -> tensor<1x1x1x1x136x64x16x16xf32> { %cst_0 = arith.constant 0.0 : f32 - // tensor.pack is lowered to tensor.pad + tensor.expand_shape + linalg.transpose + // linalg.pack is lowered to tensor.pad + tensor.expand_shape + linalg.transpose // CHECK-SAME: %[[ARG0:[^:]*]]: tensor<129x47x16x16xf32> // CHECK-DAG: %[[PAD:.*]] = tensor.pad %[[ARG0]] // CHECK-NOT: %[[RES:.*]] = tensor.insert_slice %[[PAD]] // CHECK: %[[PAD_EXPANDED:.*]] = 
tensor.expand_shape %[[PAD]] // CHECK-DAG: %[[RES:.*]] = linalg.transpose ins(%[[PAD_EXPANDED]] - %pack = tensor.pack %arg0 padding_value(%cst_0 : f32) inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 + %pack = linalg.pack %arg0 padding_value(%cst_0 : f32) inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 : tensor<129x47x16x16xf32> -> tensor<1x1x1x1x136x64x16x16xf32> return %pack : tensor<1x1x1x1x136x64x16x16xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack {lowerPadLikeWithInsertSlice = false}: (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack {lowerPadLikeWithInsertSlice = false}: (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -141,16 +141,16 @@ func.func @pack_not_a_pad(%arg0: tensor<129x47x16x16xf32>, %arg1: tensor<1x1x16x // CHECK-SAME: outs(%{{.*}} : tensor<1x1x16x16x136x64xf32>) // CHECK-SAME: permutation = [0, 2, 4, 5, 1, 3] - %pack = tensor.pack %arg0 padding_value(%cst_0 : f32) inner_dims_pos = [0, 1] inner_tiles = [136, 64] into %arg1 + %pack = linalg.pack %arg0 padding_value(%cst_0 : f32) inner_dims_pos = [0, 1] inner_tiles = [136, 64] into %arg1 : tensor<129x47x16x16xf32> -> tensor<1x1x16x16x136x64xf32> return %pack : tensor<1x1x16x16x136x64xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -172,16 +172,16 @@ func.func @unpack(%arg0: tensor<17x2x16x16x32x8xf32>, %arg1: tensor<129x47x16x16 // CHECK-SAME: : tensor<136x64x16x16xf32> to tensor<129x47x16x16xf32> // CHECK: linalg.copy ins(%[[SLICE]] : tensor<129x47x16x16xf32>) // CHECK-SAME: outs(%[[ARG1]] : tensor<129x47x16x16xf32>) - %unpack = tensor.unpack %arg0 inner_dims_pos = [1, 0] inner_tiles = [32, 8] into %arg1 + %unpack = linalg.unpack %arg0 inner_dims_pos = [1, 0] inner_tiles = [32, 8] into %arg1 : tensor<17x2x16x16x32x8xf32> -> tensor<129x47x16x16xf32> return %unpack : tensor<129x47x16x16xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %unpack = transform.structured.match ops{["tensor.unpack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">) + %unpack = transform.structured.match ops{["linalg.unpack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack : 
(!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, @@ -207,16 +207,16 @@ func.func @unpack_with_identity_outer_dims_perm(%arg0: tensor<17x2x16x16x32x8xf3 // CHECK-SAME: : tensor<136x64x16x16xf32> to tensor<129x47x16x16xf32> // CHECK: linalg.copy ins(%[[SLICE]] : tensor<129x47x16x16xf32>) // CHECK-SAME: outs(%[[ARG1]] : tensor<129x47x16x16xf32>) - %unpack = tensor.unpack %arg0 outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [1, 0] inner_tiles = [32, 8] into %arg1 + %unpack = linalg.unpack %arg0 outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [1, 0] inner_tiles = [32, 8] into %arg1 : tensor<17x2x16x16x32x8xf32> -> tensor<129x47x16x16xf32> return %unpack : tensor<129x47x16x16xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %unpack = transform.structured.match ops{["tensor.unpack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">) + %unpack = transform.structured.match ops{["linalg.unpack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, @@ -241,16 +241,16 @@ func.func @unpack_as_pad(%arg0: tensor<1x1x1x1x136x64x16x16xf32>, %arg1: tensor< // strides multiplers. // CHECK-SAME: [1, 1, 1, 1, 1, 1, 1, 1] // CHECK-SAME: : tensor<1x1x1x1x136x64x16x16xf32> to tensor<129x47x16x16xf32> - %pack = tensor.unpack %arg0 inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 + %pack = linalg.unpack %arg0 inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 : tensor<1x1x1x1x136x64x16x16xf32> -> tensor<129x47x16x16xf32> return %pack : tensor<129x47x16x16xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %unpack = transform.structured.match ops{["tensor.unpack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">) + %unpack = transform.structured.match ops{["linalg.unpack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, @@ -267,22 +267,22 @@ module attributes {transform.with_named_sequence} { func.func @unpack_as_pad_disabled_extract_slice(%arg0: tensor<1x1x1x1x136x64x16x16xf32>, %arg1: tensor<129x47x16x16xf32>) -> tensor<129x47x16x16xf32> { %cst_0 = arith.constant 0.0 : f32 - // tensor.unpack is lowered to tensor.extract_slice + linalg.transpose + tensor.collapse_shape + // linalg.unpack is lowered to tensor.extract_slice + linalg.transpose + tensor.collapse_shape // CHECK-DAG: %[[ARG0:[^:]*]]: tensor<1x1x1x1x136x64x16x16xf32> // CHECK-NOT: %[[RES:.*]] = tensor.extract_slice %[[ARG0]] // CHECK: %[[TRANSPOSED:.*]] = linalg.transpose ins(%[[ARG0]] // CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[TRANSPOSED]] // CHECK-DAG: %[[RES:.*]] = tensor.extract_slice %[[COLLAPSED]] - %pack = tensor.unpack %arg0 inner_dims_pos = [0, 1, 2, 3] inner_tiles = 
[136, 64, 16, 16] into %arg1 + %pack = linalg.unpack %arg0 inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 : tensor<1x1x1x1x136x64x16x16xf32> -> tensor<129x47x16x16xf32> return %pack : tensor<129x47x16x16xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %unpack = transform.structured.match ops{["tensor.unpack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack {lowerUnpadLikeWithExtractSlice = false}: (!transform.op<"tensor.unpack">) + %unpack = transform.structured.match ops{["linalg.unpack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack {lowerUnpadLikeWithExtractSlice = false}: (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, @@ -305,7 +305,7 @@ func.func @pack_with_outer_dims_perm(%src: tensor<100x200x128x256xi32>, // CHECK-SAME: ins(%{{.*}} : tensor<100x200x4x32x16x16xi32>) // CHECK-SAME: outs(%{{.*}} : tensor<200x4x16x100x16x32xi32>) // CHECK-SAME: permutation = [1, 2, 4, 0, 5, 3] - %0 = tensor.pack %src + %0 = linalg.pack %src outer_dims_perm = [1, 2, 3, 0] inner_dims_pos = [3, 2] inner_tiles = [16, 32] @@ -315,9 +315,9 @@ func.func @pack_with_outer_dims_perm(%src: tensor<100x200x128x256xi32>, module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -337,7 +337,7 @@ func.func @pack_with_pad(%src: tensor<4225x12xf32>, %dest: tensor<265x16x16x1xf3 // CHECK-SAME: outs(%{{[a-zA-Z0-9]*}} : tensor<265x16x16x1xf32>) // CHECK-SAME: permutation = [0, 2, 1, 3] %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.pack %src + %0 = linalg.pack %src padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %dest @@ -347,9 +347,9 @@ func.func @pack_with_pad(%src: tensor<4225x12xf32>, %dest: tensor<265x16x16x1xf3 module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -370,7 +370,7 @@ func.func @pack_with_pad_and_outer_dims_perm(%src: tensor<100x200x127x255xi32>, // CHECK-SAME: outs(%{{.*}} : tensor<200x4x16x100x16x32xi32>) // CHECK-SAME: permutation = [1, 2, 4, 0, 5, 3] %cst_0 = arith.constant 0 : 
i32 - %0 = tensor.pack %src + %0 = linalg.pack %src padding_value(%cst_0 : i32) outer_dims_perm = [1, 2, 3, 0] inner_dims_pos = [3, 2] @@ -381,9 +381,9 @@ func.func @pack_with_pad_and_outer_dims_perm(%src: tensor<100x200x127x255xi32>, module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -429,7 +429,7 @@ func.func @dynamic_pack_pad_transpose_inner_and_outer_dims(%source: tensor - %pack = tensor.pack %source padding_value(%padding_value : f32) + %pack = linalg.pack %source padding_value(%padding_value : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 32] into %init_pack : tensor -> tensor return %pack : tensor @@ -437,9 +437,9 @@ func.func @dynamic_pack_pad_transpose_inner_and_outer_dims(%source: tensor !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -453,7 +453,7 @@ module attributes {transform.with_named_sequence} { func.func @pack_as_pad_with_outer_dims_perm(%arg0: tensor<129x47x16x16xf32>, %arg1: tensor<1x1x1x1x136x64x16x16xf32>) -> tensor<1x1x1x1x136x64x16x16xf32> { %cst_0 = arith.constant 0.0 : f32 - // tensor.pack is lowered to tensor.pad + tensor.insert_slice + // linalg.pack is lowered to tensor.pad + tensor.insert_slice // CHECK: %[[PAD:.*]] = tensor.pad %[[SRC]] low[0, 0, 0, 0] high[7, 17, 0, 0] // CHECK: : tensor<129x47x16x16xf32> to tensor<136x64x16x16xf32> // CHECK: %[[RES:.*]] = tensor.insert_slice %[[PAD]] into %[[OUT]] @@ -465,7 +465,7 @@ func.func @pack_as_pad_with_outer_dims_perm(%arg0: tensor<129x47x16x16xf32>, %ar // CHECK-SAME: [1, 1, 1, 1, 1, 1, 1, 1] // CHECK-SAME: : tensor<136x64x16x16xf32> into tensor<1x1x1x1x136x64x16x16xf32> // CHECK: return %[[RES]] - %pack = tensor.pack %arg0 + %pack = linalg.pack %arg0 padding_value(%cst_0 : f32) outer_dims_perm = [1, 2, 3, 0] inner_dims_pos = [0, 1, 2, 3] @@ -476,9 +476,9 @@ func.func @pack_as_pad_with_outer_dims_perm(%arg0: tensor<129x47x16x16xf32>, %ar module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield 
} @@ -501,7 +501,7 @@ func.func @pack_as_pad_with_unit_dims(%arg0: tensor<3x1x1x1xf32>, %arg1: tensor< // CHECK-SAME: outs(%[[OUT]] : tensor<1x1x1x1x8x1xf32>) // CHECK-SAME: permutation = [0, 2, 4, 5, 1, 3] // CHECK: return %[[TRANSPOSED]] : tensor<1x1x1x1x8x1xf32> - %pack = tensor.pack %arg0 + %pack = linalg.pack %arg0 padding_value(%zero : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %arg1 : tensor<3x1x1x1xf32> -> tensor<1x1x1x1x8x1xf32> @@ -512,9 +512,9 @@ func.func @pack_as_pad_with_unit_dims(%arg0: tensor<3x1x1x1xf32>, %arg1: tensor< module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + %pack = transform.structured.match ops{["linalg.pack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) transform.yield } @@ -541,16 +541,16 @@ module attributes {transform.with_named_sequence} { // CHECK: linalg.copy ins(%[[SLICE]] : tensor<32x?x?xf32>) // CHECK-SAME: outs(%[[ARG1]] : tensor<32x?x?xf32>) func.func @unpack_with_dynamic_dest(%arg0: tensor<32x2x49x16x16xf32>, %arg1: tensor<32x?x?xf32>) -> tensor<32x?x?xf32> { - %pack = tensor.unpack %arg0 inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %arg1 + %pack = linalg.unpack %arg0 inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %arg1 : tensor<32x2x49x16x16xf32> -> tensor<32x?x?xf32> return %pack : tensor<32x?x?xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %unpack = transform.structured.match ops{["tensor.unpack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">) + %unpack = transform.structured.match ops{["linalg.unpack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, @@ -582,15 +582,15 @@ module attributes {transform.with_named_sequence} { // CHECK: linalg.copy ins(%[[SLICE]] : tensor) // CHECK-SAME: outs(%[[ARG1]] : tensor) func.func @unpack_with_dynamic_input_dest(%arg0: tensor, %arg1: tensor) -> tensor { - %unpack = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 16] into %arg1 : tensor -> tensor + %unpack = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 16] into %arg1 : tensor -> tensor return %unpack : tensor } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %unpack = transform.structured.match ops{["tensor.unpack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">) + %unpack = transform.structured.match ops{["linalg.unpack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack : (!transform.op<"linalg.unpack">) -> 
(!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, @@ -626,14 +626,14 @@ module attributes {transform.with_named_sequence} { // CHECK: linalg.copy ins(%[[SLICE]] : tensor) // CHECK-SAME: outs(%[[ARG1]] : tensor) func.func @unpack_fully_dynamic(%source: tensor, %dest: tensor, %tile_n : index, %tile_m : index) -> tensor { - %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor + %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor return %0 : tensor } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %unpack = transform.structured.match ops{["tensor.unpack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">) + %unpack = transform.structured.match ops{["linalg.unpack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, @@ -664,16 +664,16 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: [1, 1, 1, 1, 1, 1, 1, 1] // CHECK-SAME: : tensor<1x1x1x1x136x64x16x16xf32> to tensor func.func @unpack_as_pad_dynamic(%arg0: tensor<1x1x1x1x136x64x16x16xf32>, %arg1: tensor) -> tensor { - %pack = tensor.unpack %arg0 inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 + %pack = linalg.unpack %arg0 inner_dims_pos = [0, 1, 2, 3] inner_tiles = [136, 64, 16, 16] into %arg1 : tensor<1x1x1x1x136x64x16x16xf32> -> tensor return %pack : tensor } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %unpack = transform.structured.match ops{["tensor.unpack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">) + %unpack = transform.structured.match ops{["linalg.unpack"]} in %module_op + : (!transform.any_op) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, @@ -698,16 +698,16 @@ module attributes {transform.with_named_sequence} { // CHECK: linalg.copy ins(%[[SLICE]] // CHECK-SAME: : tensor<32x64xf32>) outs(%[[ARG0]] : tensor<32x64xf32>) -> tensor<32x64xf32> func.func @unpack_with_outer_dims_perm(%arg0: tensor<32x64xf32>, %arg1: tensor<2x4x32x8xf32>) -> tensor<32x64xf32> { - %unpack = tensor.unpack %arg1 outer_dims_perm = [1, 0] + %unpack = linalg.unpack %arg1 outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [32, 8] into %arg0 : tensor<2x4x32x8xf32> -> tensor<32x64xf32> return %unpack : tensor<32x64xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %unpack = transform.structured.match ops{["tensor.unpack"]} in %module_op - : (!transform.any_op) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">) + %unpack = transform.structured.match ops{["linalg.unpack"]} in 
%module_op + : (!transform.any_op) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, diff --git a/mlir/test/Dialect/Linalg/transform-op-fuse.mlir b/mlir/test/Dialect/Linalg/transform-op-fuse.mlir index ac1ca9319d335..20019424e8d3c 100644 --- a/mlir/test/Dialect/Linalg/transform-op-fuse.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-fuse.mlir @@ -106,12 +106,12 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func.func @unpack_elemwise // CHECK: %[[RES:.*]] = scf.for // CHECK: scf.for -// CHECK: tensor.unpack +// CHECK: linalg.unpack // CHECK: linalg.elemwise_unary // CHECK: return %[[RES]] func.func @unpack_elemwise(%arg0: tensor<16x48x8x8xf32>, %arg1: tensor<128x384xf32>) -> tensor<128x384xf32> { %0 = tensor.empty() : tensor<128x384xf32> - %1 = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %0 + %1 = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %0 : tensor<16x48x8x8xf32> -> tensor<128x384xf32> %2 = linalg.elemwise_unary ins(%1: tensor<128x384xf32>) outs(%arg1: tensor<128x384xf32>) -> tensor<128x384xf32> @@ -132,12 +132,12 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func.func @pack_elemwise // CHECK: %[[RES:.*]] = scf.for // CHECK: scf.for -// CHECK: tensor.pack +// CHECK: linalg.pack // CHECK: linalg.elemwise_unary // CHECK: return %[[RES]] func.func @pack_elemwise(%arg0: tensor<128x384xf32>, %arg1: tensor<16x48x8x8xf32>) -> tensor<16x48x8x8xf32> { %0 = tensor.empty() : tensor<16x48x8x8xf32> - %1 = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %0 + %1 = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %0 : tensor<128x384xf32> -> tensor<16x48x8x8xf32> %2 = linalg.elemwise_unary ins(%1: tensor<16x48x8x8xf32>) outs(%arg1: tensor<16x48x8x8xf32>) -> tensor<16x48x8x8xf32> @@ -156,14 +156,14 @@ module attributes {transform.with_named_sequence} { // ----- // CHECK-LABEL: func.func @nofuse_pack_elemwise -// CHECK: tensor.pack +// CHECK: linalg.pack // CHECK: %[[RES:.*]] = scf.for // CHECK: scf.for // CHECK: linalg.elemwise_unary // CHECK: return %[[RES]] func.func @nofuse_pack_elemwise(%arg0: tensor<128x384xf32>, %arg1: tensor<16x48x8x8xf32>) -> tensor<16x48x8x8xf32> { %0 = tensor.empty() : tensor<16x48x8x8xf32> - %1 = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %0 + %1 = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %0 : tensor<128x384xf32> -> tensor<16x48x8x8xf32> %2 = linalg.elemwise_unary ins(%1: tensor<16x48x8x8xf32>) outs(%arg1: tensor<16x48x8x8xf32>) -> tensor<16x48x8x8xf32> diff --git a/mlir/test/Dialect/Linalg/transform-op-pack.mlir b/mlir/test/Dialect/Linalg/transform-op-pack.mlir index 6c26ebd0a5b84..b3ad73e8df8e7 100644 --- a/mlir/test/Dialect/Linalg/transform-op-pack.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-pack.mlir @@ -15,9 +15,9 @@ // CHECK-SAME: %[[T1:.+]]: tensor<3xf16> func.func @reduction_2d_static(%t0: tensor<3x7xf16>, %t1: tensor<3xf16>) -> tensor<3xf16> { // CHECK: %[[EMPTY:.*]] = tensor.empty() : tensor<3x2x4xf16> - // CHECK: %[[PACKED:.*]] = tensor.pack %[[T0]] padding_value(%{{.*}} : f16) + // CHECK: %[[PACKED:.*]] = linalg.pack %[[T0]] padding_value(%{{.*}} : f16) // CHECK-SAME: inner_dims_pos = [1] inner_tiles = [4] into %[[EMPTY]] : tensor<3x7xf16> -> tensor<3x2x4xf16> - // CHECK-NOT: tensor.pack + // 
CHECK-NOT: linalg.pack // CHECK: linalg.generic // CHECK-SAME: indexing_maps = [#[[$PACKED_MAP_0]], #[[$PACKED_MAP_1]]] // CHECK-SAME: iterator_types = ["parallel", "reduction", "reduction"] @@ -29,7 +29,7 @@ func.func @reduction_2d_static(%t0: tensor<3x7xf16>, %t1: tensor<3xf16>) -> tens linalg.yield %3 : f16 } -> tensor<3xf16> - // CHECK-NOT: tensor.unpack + // CHECK-NOT: linalg.unpack return %2 : tensor<3xf16> } @@ -59,9 +59,9 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[T1:.+]]: tensor<3xf16> func.func @col_reduction_2d_static(%t0: tensor<7x3xf16>, %t1: tensor<3xf16>) -> tensor<3xf16> { // CHECK: %[[EMPTY:.*]] = tensor.empty() : tensor<3x2x4xf16> - // CHECK: %[[PACKED:.*]] = tensor.pack %[[T0]] padding_value(%{{.*}} : f16) + // CHECK: %[[PACKED:.*]] = linalg.pack %[[T0]] padding_value(%{{.*}} : f16) // CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [0] inner_tiles = [4] into %[[EMPTY]] : tensor<7x3xf16> -> tensor<3x2x4xf16> - // CHECK-NOT: tensor.pack + // CHECK-NOT: linalg.pack // CHECK: linalg.generic // CHECK-SAME: indexing_maps = [#[[$PACKED_MAP_0]], #[[$PACKED_MAP_1]]] // CHECK-SAME: iterator_types = ["reduction", "parallel", "reduction"] @@ -73,7 +73,7 @@ func.func @col_reduction_2d_static(%t0: tensor<7x3xf16>, %t1: tensor<3xf16>) -> linalg.yield %3 : f16 } -> tensor<3xf16> - // CHECK-NOT: tensor.unpack + // CHECK-NOT: linalg.unpack return %2 : tensor<3xf16> } @@ -83,12 +83,12 @@ module attributes {transform.with_named_sequence} { %1 = transform.structured.pack %0 packed_sizes = [4, 0] : (!transform.any_op) -> (!transform.op<"linalg.generic">) %pack = transform.get_producer_of_operand %1[0] - : (!transform.op<"linalg.generic">) -> (!transform.op<"tensor.pack">) + : (!transform.op<"linalg.generic">) -> (!transform.op<"linalg.pack">) %2, %pack_2, %empty_unpack_2 = transform.structured.pack_transpose %pack with_compute_op(%1) outer_perm = [1, 0] - : (!transform.op<"tensor.pack">, !transform.op<"linalg.generic">) - -> (!transform.op<"linalg.generic">, !transform.op<"tensor.pack">, !transform.any_op) + : (!transform.op<"linalg.pack">, !transform.op<"linalg.generic">) + -> (!transform.op<"linalg.generic">, !transform.op<"linalg.pack">, !transform.any_op) transform.yield } } @@ -116,9 +116,9 @@ func.func @reduction_2d_dynamic(%t0: tensor, %t1: tensor) -> ten // CHECK-DAG: %[[D1:.*]] = tensor.dim %[[T0]], %[[C1]] : tensor // CHECK: %[[D1B4:.*]] = affine.apply #[[$DIV4]]()[%[[D1]]] // CHECK: %[[EMPTY:.*]] = tensor.empty(%[[D0]], %[[D1B4]]) : tensor - // CHECK: %[[PACKED:.*]] = tensor.pack %[[T0]] padding_value(%{{.*}} : f16) + // CHECK: %[[PACKED:.*]] = linalg.pack %[[T0]] padding_value(%{{.*}} : f16) // CHECK-SAME: inner_dims_pos = [1] inner_tiles = [4] into %[[EMPTY]] : tensor -> tensor - // CHECK-NOT: tensor.pack + // CHECK-NOT: linalg.pack // CHECK: linalg.generic // CHECK-SAME: indexing_maps = [#[[$PACKED_MAP_0]], #[[$PACKED_MAP_1]]] // CHECK-SAME: iterator_types = ["parallel", "reduction", "reduction"] @@ -130,7 +130,7 @@ func.func @reduction_2d_dynamic(%t0: tensor, %t1: tensor) -> ten linalg.yield %3 : f16 } -> tensor - // CHECK-NOT: tensor.unpack + // CHECK-NOT: linalg.unpack return %2 : tensor } @@ -162,11 +162,11 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[T0:.+]]: tensor, // CHECK-SAME: %[[T1:.+]]: tensor func.func @reduction_2d_dynamic(%t0: tensor, %t1: tensor) -> tensor { - // CHECK: %[[PACKED_0:.*]] = tensor.pack %[[T0]] padding_value(%{{.*}} : f16) + // CHECK: %[[PACKED_0:.*]] = linalg.pack %[[T0]] 
padding_value(%{{.*}} : f16) // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [3, 4] into %{{.*}} : tensor -> tensor - // CHECK: %[[PACKED_1:.*]] = tensor.pack %[[T1]] padding_value(%{{.*}} : f16) + // CHECK: %[[PACKED_1:.*]] = linalg.pack %[[T1]] padding_value(%{{.*}} : f16) // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [3] into %{{.*}} : tensor -> tensor - // CHECK-NOT: tensor.pack + // CHECK-NOT: linalg.pack // CHECK: linalg.generic // CHECK-SAME: indexing_maps = [#[[$PACKED_MAP_0]], #[[$PACKED_MAP_1]]] // CHECK-SAME: iterator_types = ["parallel", "reduction", "parallel", "reduction"] @@ -178,7 +178,7 @@ func.func @reduction_2d_dynamic(%t0: tensor, %t1: tensor) -> ten linalg.yield %3 : f16 } -> tensor - // CHECK: tensor.unpack %{{.*}} inner_dims_pos = [0] inner_tiles = [3] into %{{.*}} : tensor -> tensor + // CHECK: linalg.unpack %{{.*}} inner_dims_pos = [0] inner_tiles = [3] into %{{.*}} : tensor -> tensor return %2 : tensor } @@ -207,11 +207,11 @@ module attributes {transform.with_named_sequence} { func.func @matmul(%A: tensor, %B: tensor, %C: tensor) -> tensor { - // CHECK: %[[PACK_A:.*]] = tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [2, 4] + // CHECK: %[[PACK_A:.*]] = linalg.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [2, 4] // CHECK-SAME: : tensor -> tensor - // CHECK: %[[PACK_B:.*]] = tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [3, 4] + // CHECK: %[[PACK_B:.*]] = linalg.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [3, 4] // CHECK-SAME: : tensor -> tensor - // CHECK: %[[PACK_C:.*]] = tensor.pack %{{.*}} outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [3, 2] + // CHECK: %[[PACK_C:.*]] = linalg.pack %{{.*}} outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [3, 2] // CHECK-SAME: : tensor -> tensor // CHECK: linalg.generic {indexing_maps = [#[[$PACKED_MAP_0]], #[[$PACKED_MAP_1]], #[[$PACKED_MAP_2]]] @@ -222,7 +222,7 @@ func.func @matmul(%A: tensor, %B: tensor, %C: tensor) outs(%C: tensor) -> tensor - // CHECK: tensor.unpack %{{.*}} outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [3, 2] + // CHECK: linalg.unpack %{{.*}} outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [3, 2] // CHECK-SAME: : tensor -> tensor return %0 : tensor } @@ -235,12 +235,12 @@ module attributes {transform.with_named_sequence} { : (!transform.any_op) -> (!transform.op<"linalg.generic">) %unpack = transform.get_consumers_of_result %1[0] - : (!transform.op<"linalg.generic">) -> (!transform.op<"tensor.unpack">) + : (!transform.op<"linalg.generic">) -> (!transform.op<"linalg.unpack">) %2, %pack_2, %unpack_2 = transform.structured.pack_transpose %unpack with_compute_op(%1) outer_perm = [1, 0] inner_perm = [1, 0] - : (!transform.op<"tensor.unpack">, !transform.op<"linalg.generic">) - -> (!transform.op<"linalg.generic">, !transform.op<"tensor.pack">, !transform.op<"tensor.unpack">) + : (!transform.op<"linalg.unpack">, !transform.op<"linalg.generic">) + -> (!transform.op<"linalg.generic">, !transform.op<"linalg.pack">, !transform.op<"linalg.unpack">) transform.yield } } @@ -259,11 +259,11 @@ module attributes {transform.with_named_sequence} { func.func @conv_2d_nchw_fchw(%i: tensor<14x512x28x28xf32>, %f: tensor<1024x512x1x1xf32>, %o: tensor<14x1024x28x28xf32>) -> tensor<14x1024x28x28xf32> { - // CHECK: %[[PACK_INPUT:.*]] = tensor.pack %{{.*}} inner_dims_pos = [1] inner_tiles = [8] + // CHECK: %[[PACK_INPUT:.*]] = linalg.pack %{{.*}} inner_dims_pos = [1] inner_tiles = [8] // CHECK-SAME: : tensor<14x512x28x28xf32> -> 
tensor<14x64x28x28x8xf32> - // CHECK: %[[PACK_FILTER:.*]] = tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [4, 8] + // CHECK: %[[PACK_FILTER:.*]] = linalg.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [4, 8] // CHECK-SAME: : tensor<1024x512x1x1xf32> -> tensor<256x64x1x1x4x8xf32> - // CHECK: %[[PACK_INPUT:.*]] = tensor.pack %{{.*}} inner_dims_pos = [1] inner_tiles = [4] + // CHECK: %[[PACK_INPUT:.*]] = linalg.pack %{{.*}} inner_dims_pos = [1] inner_tiles = [4] // CHECK-SAME: : tensor<14x1024x28x28xf32> -> tensor<14x256x28x28x4xf32> // CHECK: linalg.generic {indexing_maps = [#[[$PACKED_MAP_0]], #[[$PACKED_MAP_1]], #[[$PACKED_MAP_2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction", "parallel", "reduction"]} @@ -272,7 +272,7 @@ func.func @conv_2d_nchw_fchw(%i: tensor<14x512x28x28xf32>, %f: tensor<1024x512x1 %0 = linalg.conv_2d_nchw_fchw ins(%i, %f: tensor<14x512x28x28xf32>, tensor<1024x512x1x1xf32>) outs(%o: tensor<14x1024x28x28xf32>) -> tensor<14x1024x28x28xf32> - // CHECK: tensor.unpack %{{.*}} inner_dims_pos = [1] inner_tiles = [4] + // CHECK: linalg.unpack %{{.*}} inner_dims_pos = [1] inner_tiles = [4] // CHECK-SAME: : tensor<14x256x28x28x4xf32> -> tensor<14x1024x28x28xf32> return %0: tensor<14x1024x28x28xf32> } @@ -300,11 +300,11 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[INIT:.+]]: tensor func.func @conv_2d_nhwc_hwcf(%input: tensor, %filter: tensor<1x?x?x?xf32>, %init: tensor) -> tensor { - // CHECK: %[[PACK_INPUT:.*]] = tensor.pack %{{.*}} inner_dims_pos = [3] inner_tiles = [6] + // CHECK: %[[PACK_INPUT:.*]] = linalg.pack %{{.*}} inner_dims_pos = [3] inner_tiles = [6] // CHECK-SAME: : tensor -> tensor - // CHECK: %[[PACK_FILTER:.*]] = tensor.pack %{{.*}} inner_dims_pos = [3, 2] inner_tiles = [4, 6] + // CHECK: %[[PACK_FILTER:.*]] = linalg.pack %{{.*}} inner_dims_pos = [3, 2] inner_tiles = [4, 6] // CHECK-SAME: : tensor<1x?x?x?xf32> -> tensor<1x?x?x?x4x6xf32> - // CHECK: %[[PACK_OUTPUT:.*]] = tensor.pack %{{.*}} inner_dims_pos = [3] inner_tiles = [4] + // CHECK: %[[PACK_OUTPUT:.*]] = linalg.pack %{{.*}} inner_dims_pos = [3] inner_tiles = [4] // CHECK-SAME: : tensor -> tensor // CHECK: linalg.generic {indexing_maps = [#[[$PACKED_MAP_0]], #[[$PACKED_MAP_1]], #[[$PACKED_MAP_2]]] @@ -315,7 +315,7 @@ func.func @conv_2d_nhwc_hwcf(%input: tensor, %filter: tensor<1x?x?x ins (%input, %filter: tensor, tensor<1x?x?x?xf32>) outs (%init: tensor) -> tensor - // CHECK: tensor.unpack %{{.*}} inner_dims_pos = [3] inner_tiles = [4] + // CHECK: linalg.unpack %{{.*}} inner_dims_pos = [3] inner_tiles = [4] // CHECK-SAME: : tensor -> tensor return %0 : tensor } @@ -349,11 +349,11 @@ func.func @matmul_dynamic_pack_size(%A: tensor, %B: tensor, %C // CHECK: %[[TS:.*]] = "some_tile_size"() : () -> index %sz = "some_tile_size"() : () -> (index) - // CHECK: %[[PACK_A:.*]] = tensor.pack %[[A]] {{.*}} inner_dims_pos = [1] inner_tiles = [%[[TS]]] + // CHECK: %[[PACK_A:.*]] = linalg.pack %[[A]] {{.*}} inner_dims_pos = [1] inner_tiles = [%[[TS]]] // CHECK-SAME: : tensor -> tensor - // CHECK: %[[PACK_B:.*]] = tensor.pack %[[B]] {{.*}} inner_dims_pos = [1, 0] inner_tiles = [%[[TS]], %[[TS]]] + // CHECK: %[[PACK_B:.*]] = linalg.pack %[[B]] {{.*}} inner_dims_pos = [1, 0] inner_tiles = [%[[TS]], %[[TS]]] // CHECK-SAME: : tensor -> tensor - // CHECK: %[[PACK_C:.*]] = tensor.pack %[[C]] {{.*}} inner_dims_pos = [1] inner_tiles = [%[[TS]]] + // CHECK: %[[PACK_C:.*]] = linalg.pack %[[C]] {{.*}} inner_dims_pos = 
[1] inner_tiles = [%[[TS]]] // CHECK-SAME: : tensor -> tensor // CHECK: linalg.generic {indexing_maps = [#[[$PACKED_MAP_0]], #[[$PACKED_MAP_1]], #[[$PACKED_MAP_2]]] // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "reduction"]} @@ -363,7 +363,7 @@ func.func @matmul_dynamic_pack_size(%A: tensor, %B: tensor, %C outs(%C: tensor) -> tensor - // CHECK: tensor.unpack %{{.*}} inner_dims_pos = [1] inner_tiles = [%[[TS]]] into %[[C]] + // CHECK: linalg.unpack %{{.*}} inner_dims_pos = [1] inner_tiles = [%[[TS]]] into %[[C]] // CHECK-SAME: : tensor -> tensor return %0 : tensor } @@ -445,16 +445,16 @@ module attributes {transform.with_named_sequence} { // ----- func.func @no_single_packing_op(%source: tensor<128x256xf32>, %dest: tensor<4x16x32x16xf32>) { - %0 = tensor.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> - %1 = tensor.unpack %0 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %source : tensor<4x16x32x16xf32> -> tensor<128x256xf32> - %2 = tensor.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> + %0 = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> + %1 = linalg.unpack %0 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %source : tensor<4x16x32x16xf32> -> tensor<128x256xf32> + %2 = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> return } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{requires target to map to exactly 1 packing op and 1 packed op (got 2 and 1)}} transform.structured.pack_transpose %0 with_compute_op(%1) inner_perm = [0] @@ -476,7 +476,7 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["arith.constant"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1 = transform.structured.match ops{["tensor.empty"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{requires target to map to a tensor.pack or tensor.unpack}} + // expected-error @below {{requires target to map to a linalg.pack or linalg.unpack}} transform.structured.pack_transpose %0 with_compute_op(%1) inner_perm = [0] : (!transform.any_op, !transform.any_op) @@ -488,14 +488,14 @@ module attributes {transform.with_named_sequence} { // ----- func.func @no_linalg_target(%source: tensor<128x256xf32>, %dest: tensor<4x16x32x16xf32>) { - %0 = tensor.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> + %0 = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> %1 = arith.constant 0 : index return } module attributes {transform.with_named_sequence} { 
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1 = transform.structured.match ops{["arith.constant"]} in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{requires a LinalgOp target}} transform.structured.pack_transpose %0 with_compute_op(%1) @@ -509,7 +509,7 @@ module attributes {transform.with_named_sequence} { // ----- func.func @no_single_use_by_linalg(%source: tensor<128x256xf32>, %dest: tensor<4x16x32x16xf32>) { - %0 = tensor.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> + %0 = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> %f0 = arith.constant 0.0 : f32 %1 = tensor.empty() : tensor %2 = linalg.fill ins(%f0: f32) outs(%1 : tensor) -> tensor @@ -518,7 +518,7 @@ func.func @no_single_use_by_linalg(%source: tensor<128x256xf32>, %dest: tensor<4 module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{not a single use by the LinalgOp target}} transform.structured.pack_transpose %0 with_compute_op(%1) @@ -532,8 +532,8 @@ module attributes {transform.with_named_sequence} { // ----- func.func @not_produced_by_linalg(%source: tensor<128x256xf32>, %dest: tensor<4x16x32x16xf32>) { - %a = tensor.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> - %b = tensor.unpack %a inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %source : tensor<4x16x32x16xf32> -> tensor<128x256xf32> + %a = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> + %b = linalg.unpack %a inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %source : tensor<4x16x32x16xf32> -> tensor<128x256xf32> %f0 = arith.constant 0.0 : f32 %1 = tensor.empty() : tensor %2 = linalg.fill ins(%f0: f32) outs(%1 : tensor) -> tensor @@ -542,7 +542,7 @@ func.func @not_produced_by_linalg(%source: tensor<128x256xf32>, %dest: tensor<4x module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{not produced by the LinalgOp target}} transform.structured.pack_transpose %0 with_compute_op(%1) @@ -559,13 +559,13 @@ func.func @no_matching_pack(%source: tensor<16xf32>) { %f0 = arith.constant 0.0 : f32 %1 = tensor.empty() : tensor<4x4xf32> %2 = linalg.fill ins(%f0: f32) outs(%1 : tensor<4x4xf32>) -> tensor<4x4xf32> - %b = tensor.unpack %2 
inner_dims_pos = [0] inner_tiles = [4] into %source : tensor<4x4xf32> -> tensor<16xf32> + %b = linalg.unpack %2 inner_dims_pos = [0] inner_tiles = [4] into %source : tensor<4x4xf32> -> tensor<16xf32> return } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op %1 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op // expected-error @below {{could not find matching pack op}} transform.structured.pack_transpose %0 with_compute_op(%1) @@ -593,13 +593,13 @@ module attributes {transform.with_named_sequence} { : (!transform.any_op) -> (!transform.op<"linalg.generic">) %unpack = transform.get_consumers_of_result %1[0] - : (!transform.op<"linalg.generic">) -> (!transform.op<"tensor.unpack">) + : (!transform.op<"linalg.generic">) -> (!transform.op<"linalg.unpack">) %2, %pack_2, %unpack_2 = // expected-error @below {{invalid outer_perm}} transform.structured.pack_transpose %unpack with_compute_op(%1) outer_perm = [1] - : (!transform.op<"tensor.unpack">, !transform.op<"linalg.generic">) - -> (!transform.op<"linalg.generic">, !transform.op<"tensor.pack">, !transform.op<"tensor.unpack">) + : (!transform.op<"linalg.unpack">, !transform.op<"linalg.generic">) + -> (!transform.op<"linalg.generic">, !transform.op<"linalg.pack">, !transform.op<"linalg.unpack">) transform.yield } } @@ -621,13 +621,13 @@ module attributes {transform.with_named_sequence} { : (!transform.any_op) -> (!transform.op<"linalg.generic">) %unpack = transform.get_consumers_of_result %1[0] - : (!transform.op<"linalg.generic">) -> (!transform.op<"tensor.unpack">) + : (!transform.op<"linalg.generic">) -> (!transform.op<"linalg.unpack">) %2, %pack_2, %unpack_2 = // expected-error @below {{invalid inner_perm}} transform.structured.pack_transpose %unpack with_compute_op(%1) inner_perm = [1] - : (!transform.op<"tensor.unpack">, !transform.op<"linalg.generic">) - -> (!transform.op<"linalg.generic">, !transform.op<"tensor.pack">, !transform.op<"tensor.unpack">) + : (!transform.op<"linalg.unpack">, !transform.op<"linalg.generic">) + -> (!transform.op<"linalg.generic">, !transform.op<"linalg.pack">, !transform.op<"linalg.unpack">) transform.yield } } @@ -643,12 +643,12 @@ func.func @no_padding_on_packs(%A: tensor<32x32xf32>, %B: tensor<32x32xf32>, %C: } // CHECK-LABEL: no_padding_on_packs -// CHECK: tensor.pack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [4, 8] +// CHECK: linalg.pack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [4, 8] // CHECK-SAME: into %{{.+}} : tensor<32x32xf32> -> tensor<8x4x4x8xf32> -// CHECK: tensor.pack %{{.+}} outer_dims_perm = [1, 0] +// CHECK: linalg.pack %{{.+}} outer_dims_perm = [1, 0] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [8, 8] // CHECK-SAME: into %{{.+}} : tensor<32x32xf32> -> tensor<4x4x8x8xf32> -// CHECK: tensor.pack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [4, 8] +// CHECK: linalg.pack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [4, 8] // CHECK-SAME: into %{{.+}} : tensor<32x32xf32> -> tensor<8x4x4x8xf32> module attributes {transform.with_named_sequence} { @@ -657,12 +657,12 @@ module attributes {transform.with_named_sequence} { %1 = transform.structured.pack %0 packed_sizes = [4, 8, 8] : (!transform.any_op) -> 
(!transform.op<"linalg.generic">) %pack = transform.get_producer_of_operand %1[1] - : (!transform.op<"linalg.generic">) -> (!transform.op<"tensor.pack">) + : (!transform.op<"linalg.generic">) -> (!transform.op<"linalg.pack">) %2, %pack_2, %empty_unpack_2 = transform.structured.pack_transpose %pack with_compute_op(%1) outer_perm = [1, 0] inner_perm = [1, 0] - : (!transform.op<"tensor.pack">, !transform.op<"linalg.generic">) - -> (!transform.op<"linalg.generic">, !transform.op<"tensor.pack">, !transform.any_op) + : (!transform.op<"linalg.pack">, !transform.op<"linalg.generic">) + -> (!transform.op<"linalg.generic">, !transform.op<"linalg.pack">, !transform.any_op) transform.yield } } diff --git a/mlir/test/Dialect/Linalg/transform-op-tile-pack-unpack.mlir b/mlir/test/Dialect/Linalg/transform-op-tile-pack-unpack.mlir new file mode 100644 index 0000000000000..456a5ea453963 --- /dev/null +++ b/mlir/test/Dialect/Linalg/transform-op-tile-pack-unpack.mlir @@ -0,0 +1,491 @@ +// RUN: mlir-opt %s -transform-interpreter -canonicalize -cse -split-input-file | FileCheck %s + +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 * 32)> +// CHECK: func.func @NC_to_NCnc +// CHECK-SAME: %[[IN:.*]]: tensor<128x256xf32>, +// CHECK-SAME: %[[OUT:.*]]: tensor<4x8x32x32xf32>) -> tensor<4x8x32x32xf32> { +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[RES0:.*]] = scf.for %[[N:.*]] = %[[C0]] to %[[C4]] step %[[C2]] iter_args(%[[ITER0:.*]] = %[[OUT]]) -> (tensor<4x8x32x32xf32>) { +// CHECK: %[[RES1:.+]] = scf.for %[[C:.*]] = %[[C0]] to %[[C8]] step %[[C4]] iter_args(%[[ITER1:.*]] = %[[ITER0]]) -> (tensor<4x8x32x32xf32>) { +// CHECK-DAG: %[[IN_N:.+]] = affine.apply #[[MAP0]](%[[N]]) +// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP0]](%[[C]]) +// CHECK: %[[SUB_IN:.*]] = tensor.extract_slice %[[IN]][%[[IN_N]], %[[IN_C]]] [64, 128] [1, 1] : tensor<128x256xf32> to tensor<64x128xf32> +// CHECK: %[[SUB_OUT:.*]] = tensor.extract_slice %[[ITER1]][%[[N]], %[[C]], 0, 0] [2, 4, 32, 32] [1, 1, 1, 1] : tensor<4x8x32x32xf32> to tensor<2x4x32x32xf32> +// CHECK: %[[SUB_RES:.*]] = linalg.pack +// CHECK-SAME: %[[SUB_IN]] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[SUB_OUT]] +// CHECK: %[[INSERT:.*]] = tensor.insert_slice %[[SUB_RES]] into %[[ITER1]] +// CHECK: scf.yield %[[INSERT]] : tensor<4x8x32x32xf32> +// CHECK: } +// CHECK: scf.yield %[[RES1:.*]] : tensor<4x8x32x32xf32> +// CHECK: } +// CHECK: return %[[RES0:.*]] : tensor<4x8x32x32xf32> +// CHECK: } +func.func @NC_to_NCnc(%arg0: tensor<128x256xf32>, %arg1: tensor<4x8x32x32xf32>) -> tensor<4x8x32x32xf32> { + %0 = linalg.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %arg1 : tensor<128x256xf32> -> tensor<4x8x32x32xf32> + return %0 : tensor<4x8x32x32xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK: #[[MAP0:.+]] = affine_map<(d0) -> (d0 * 8)> +// CHECK: func.func @KC_to_CKkc +// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: +// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: +// CHECK-DAG: 
%[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index +// CHECK: scf.for %[[C:.+]] = %[[C0]] to %[[C32]] step %[[C2]] +// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP0]](%[[C]]) +// CHECK: %[[INPUT_SLICE:.+]] = tensor.extract_slice %[[IN]] +// CHECK-SAME: [0, %[[IN_C]]] [128, 16] +// CHECK: %[[OUTPUT_SLICE:.+]] = tensor.extract_slice %{{.+}}[%[[C]], 0, 0, 0] [2, 4, 32, 8] +// CHECK: linalg.pack +// CHECK-SAME: %[[INPUT_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] +// CHECK-SAME: into %[[OUTPUT_SLICE]] +func.func @KC_to_CKkc(%arg0: tensor<128x256xf32>, %arg1: tensor<32x4x32x8xf32>) -> tensor<32x4x32x8xf32> { + %0 = linalg.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<128x256xf32> -> tensor<32x4x32x8xf32> + return %0 : tensor<32x4x32x8xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 * 2)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 * -2 + 15, 8)> +// CHECK: func.func @pad_and_pack_static( +// CHECK-SAME: %[[IN:.*]]: tensor<13x15xf32>, +// CHECK-SAME: %[[OUT:.*]]: tensor<2x8x8x2xf32>, +// CHECK-SAME: %[[PAD:.*]]: f32) -> tensor<2x8x8x2xf32> { +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[RES0:.*]] = scf.for %[[J:.*]] = %[[C0]] to %[[C8]] step %[[C4]] iter_args(%[[ITER1:.*]] = %[[OUT]]) -> (tensor<2x8x8x2xf32>) { +// CHECK-DAG: %[[IN_J:.*]] = affine.apply #[[MAP0]](%[[J]]) +// CHECK-DAG: %[[IN_J_SZ:.*]] = affine.min #[[MAP1]](%[[J]]) +// CHECK: %[[SUB_IN:.*]] = tensor.extract_slice %[[IN]][0, %[[IN_J]]] [13, %[[IN_J_SZ]]] [1, 1] +// CHECK: %[[SUB_OUT:.*]] = tensor.extract_slice %[[ITER1]][0, %[[J]], 0, 0] [2, 4, 8, 2] [1, 1, 1, 1] +// CHECK: %[[SUB_RES:.*]] = linalg.pack +// CHECK-SAME: %[[SUB_IN]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] +// CHECK-SAME: into %[[SUB_OUT]] +// CHECK: %[[INSERT:.*]] = tensor.insert_slice %[[SUB_RES]] into %[[ITER1]] +// CHECK: scf.yield %[[INSERT]] : tensor<2x8x8x2xf32> +// CHECK: } +// CHECK: return %[[RES0:.*]] : tensor<2x8x8x2xf32> +// CHECK: } +func.func @pad_and_pack_static(%input: tensor<13x15xf32>, %output: tensor<2x8x8x2xf32>, %pad: f32) -> tensor<2x8x8x2xf32> { + %0 = linalg.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor<13x15xf32> -> tensor<2x8x8x2xf32> + return %0 : tensor<2x8x8x2xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 
+ s0, 2)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> (d0 * 8)> +// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -8 + s0, d0 * 8)> +// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0) -> (d0 * 2)> +// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -2 + s0, d0 * 2)> +// CHECK: func.func @pad_and_pack_partially_dynamic( +// CHECK-SAME: %[[IN:.*]]: tensor, +// CHECK-SAME: %[[OUT:.*]]: tensor, +// CHECK-SAME: %[[PAD:.*]]: f32) -> tensor { +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[OUT_D0:.*]] = tensor.dim %[[OUT]], %[[C0]] : tensor +// CHECK-DAG: %[[OUT_D1:.*]] = tensor.dim %[[OUT]], %[[C1]] : tensor +// CHECK: %[[RES0:.*]] = scf.for %[[I:.*]] = %[[C0]] to %[[OUT_D0]] step %[[C2]] iter_args(%[[ITER0:.*]] = %[[OUT]]) -> (tensor) { +// CHECK: %[[RES1:.*]] = scf.for %[[J:.*]] = %[[C0]] to %[[OUT_D1]] step %[[C4]] iter_args(%[[ITER1:.*]] = %[[ITER0]]) -> (tensor) { +// CHECK-DAG: %[[OUT_I_SZ:.*]] = affine.min #[[MAP0]](%[[I]])[%[[OUT_D0]]] +// CHECK-DAG: %[[OUT_J_SZ:.*]] = affine.min #[[MAP1]](%[[J]])[%[[OUT_D1]]] +// CHECK-DAG: %[[IN_I:.*]] = affine.apply #[[MAP2]](%[[I]]) +// CHECK-DAG: %[[IN_I_SZ:.*]] = affine.min #[[MAP3]] +// CHECK-DAG: %[[IN_J:.*]] = affine.apply #[[MAP4]](%[[J]]) +// CHECK-DAG: %[[IN_J_SZ:.*]] = affine.min #[[MAP5]] +// CHECK: %[[SUB_IN:.*]] = tensor.extract_slice %[[IN]][%[[IN_I]], %[[IN_J]]] [%[[IN_I_SZ]], %[[IN_J_SZ]]] [1, 1] : tensor to tensor +// CHECK: %[[SUB_OUT:.*]] = tensor.extract_slice %[[ITER1]][%[[I]], %[[J]], 0, 0] [%[[OUT_I_SZ]], %[[OUT_J_SZ]], 8, 2] [1, 1, 1, 1] : tensor to tensor +// CHECK: %[[SUB_RES:.*]] = linalg.pack +// CHECK-SAME: %[[SUB_IN]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] +// CHECK-SAME: into %[[SUB_OUT]] +// CHECK: %[[INSERT:.*]] = tensor.insert_slice %[[SUB_RES]] into %[[ITER1]] +// CHECK: scf.yield %[[INSERT]] : tensor +// CHECK: } +// CHECK: scf.yield %[[RES1:.*]] : tensor +// CHECK: } +// CHECK: return %[[VAL_34:.*]] : tensor +// CHECK: } +func.func @pad_and_pack_partially_dynamic(%input: tensor, %output: tensor, %pad: f32) -> tensor { + %0 = linalg.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor -> tensor + return %0 : tensor +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (d0 * s0)> +// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s0, -(d1 * s0) + s1)> +// CHECK: func.func @pad_and_pack_fully_dynamic( +// CHECK-SAME: %[[IN:.*]]: tensor, +// CHECK-SAME: %[[OUT:.*]]: tensor, +// CHECK-SAME: %[[PAD:.*]]: f32, +// CHECK-SAME: %[[TILE_0:.*]]: index, +// CHECK-SAME: %[[TILE_1:.*]]: index) -> tensor { +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: 
%[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[OUT_D0:.*]] = tensor.dim %[[OUT]], %[[C0]] : tensor +// CHECK-DAG: %[[OUT_D1:.*]] = tensor.dim %[[OUT]], %[[C1]] : tensor +// CHECK: %[[RES0:.*]] = scf.for %[[I:.*]] = %[[C0]] to %[[OUT_D0]] step %[[C2]] iter_args(%[[ITER0:.*]] = %[[OUT]]) -> (tensor) { +// CHECK: %[[RES1:.*]] = scf.for %[[J:.*]] = %[[C0]] to %[[OUT_D1]] step %[[C4]] iter_args(%[[ITER1:.*]] = %[[ITER0]]) -> (tensor) { +// CHECK-DAG: %[[OUT_I_SZ:.*]] = affine.min #[[MAP0]](%[[I]])[%[[OUT_D0]]] +// CHECK-DAG: %[[OUT_J_SZ:.*]] = affine.min #[[MAP1]](%[[J]])[%[[OUT_D1]]] +// CHECK-DAG: %[[IN_D0:.*]] = tensor.dim %[[IN]], %[[C0]] +// CHECK-DAG: %[[IN_D1:.*]] = tensor.dim %[[IN]], %[[C1]] +// CHECK: %[[IN_I:.*]] = affine.apply #[[MAP2]](%[[I]])[%[[TILE_0]]] +// CHECK: %[[IN_I_SZ:.*]] = affine.min #[[MAP3]](%[[OUT_I_SZ]], %[[I]])[%[[TILE_0]], %[[IN_D0]]] +// CHECK: %[[IN_J:.*]] = affine.apply #[[MAP2]](%[[J]])[%[[TILE_1]]] +// CHECK: %[[IN_J_SZ:.*]] = affine.min #[[MAP3]](%[[OUT_J_SZ]], %[[J]])[%[[TILE_1]], %[[IN_D1]]] +// CHECK: %[[SUB_IN:.*]] = tensor.extract_slice %[[IN]][%[[IN_I]], %[[IN_J]]] [%[[IN_I_SZ]], %[[IN_J_SZ]]] [1, 1] : tensor to tensor +// CHECK: %[[OUT_D2:.+]] = tensor.dim %[[ITER1]], %[[C2]] +// CHECK: %[[OUT_D3:.+]] = tensor.dim %[[ITER1]], %[[C3]] +// CHECK: %[[SUB_OUT:.*]] = tensor.extract_slice %[[ITER1]][%[[I]], %[[J]], 0, 0] [%[[OUT_I_SZ]], %[[OUT_J_SZ]], %[[OUT_D2]], %[[OUT_D3]]] [1, 1, 1, 1] : tensor to tensor +// CHECK: %[[PACK:.*]] = linalg.pack +// CHECK-SAME: %[[SUB_IN]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [%[[TILE_0]], %[[TILE_1]]] +// CHECK-SAME: into %[[SUB_OUT]] +// CHECK: %[[INSERT:.*]] = tensor.insert_slice %[[PACK]] into %[[ITER1]] +// CHECK: scf.yield %[[INSERT]] : tensor +// CHECK: } +// CHECK: scf.yield %[[RES1:.*]] : tensor +// CHECK: } +// CHECK: return %[[RES0:.*]] : tensor +// CHECK: } +func.func @pad_and_pack_fully_dynamic(%source: tensor, %dest: tensor, %pad: f32, %tile_n : index, %tile_m : index) -> tensor { + %0 = linalg.pack %source padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor + return %0 : tensor +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 floordiv 32)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 mod 32)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> ((d0 + 1) floordiv 32 - d0 floordiv 32 + 1)> +// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0) -> (d0 floordiv 16)> +// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0) -> (d0 mod 16)> +// CHECK-DAG: #[[MAP6:.+]] = affine_map<(d0) -> ((d0 + 3) floordiv 16 - d0 floordiv 16 + 1)> +// CHECK: func.func @NCnc_to_NC +// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: +// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[C128:.*]] = 
arith.constant 128 : index +// CHECK-DAG: %[[C256:.*]] = arith.constant 256 : index +// CHECK: %{{.+}} = scf.for %[[I:.+]] = %[[C0]] to %[[C256]] step %[[C2]] +// CHECK: %{{.+}} = scf.for %[[J:.+]] = %[[C0]] to %[[C128]] step %[[C4]] +// CHECK-DAG: %[[IN_I:.+]] = affine.apply #[[MAP0]](%[[I]]) +// CHECK-DAG: %[[OFFSET_I:.+]] = affine.apply #[[MAP1]](%[[I]]) +// CHECK-DAG: %[[IN_I_SZ:.+]] = affine.apply #[[MAP2]](%[[I]]) +// CHECK-DAG: %[[IN_J:.+]] = affine.apply #[[MAP4]](%[[J]]) +// CHECK-DAG: %[[OFFSET_J:.+]] = affine.apply #[[MAP5]](%[[J]]) +// CHECK-DAG: %[[IN_J_SZ:.+]] = affine.apply #[[MAP6]](%[[J]]) +// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[IN]] +// CHECK-SAME: [%[[IN_I]], %[[IN_J]], 0, 0] [%[[IN_I_SZ]], %[[IN_J_SZ]], 32, 16] +// CHECK-SAME: : tensor<8x8x32x16xf32> to tensor +// CHECK: %[[EMPTY:.+]] = tensor.empty +// CHECK: %[[UNPACK:.+]] = linalg.unpack +// CHECK-SAME: %[[SLICE]] inner_dims_pos = [0, 1] inner_tiles = [32, 16] +// CHECK-SAME: into %[[EMPTY]] +// CHECK: %[[UNPACK_SLICE:.+]] = tensor.extract_slice %[[UNPACK]] +// CHECK-SAME: [%[[OFFSET_I]], %[[OFFSET_J]]] [2, 4] +// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK_SLICE]] +// CHECK-SAME: into %{{.+}}[%[[I]], %[[J]]] [2, 4] +// CHECK: scf.yield %[[RES]] +func.func @NCnc_to_NC(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { + %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> + return %0 : tensor<256x128xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 floordiv 32)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 mod 32)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> ((d0 + 1) floordiv 32 - d0 floordiv 32 + 1)> +// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0) -> (d0 floordiv 8)> +// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0) -> (d0 mod 8)> +// CHECK-DAG: #[[MAP6:.+]] = affine_map<(d0) -> ((d0 + 3) floordiv 8 - d0 floordiv 8 + 1)> +// CHECK: func.func @CKkc_to_KC +// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: +// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index +// CHECK-DAG: %[[C256:.*]] = arith.constant 256 : index +// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[C128]] step %[[C2]] +// CHECK: %{{.+}} = scf.for %[[C:.+]] = %[[C0]] to %[[C256]] step %[[C4]] +// CHECK-DAG: %[[IN_K:.+]] = affine.apply #[[MAP0]](%[[K]]) +// CHECK-DAG: %[[OFFSET_K:.+]] = affine.apply #[[MAP1]](%[[K]]) +// CHECK-DAG: %[[IN_K_SZ:.+]] = affine.apply #[[MAP2]](%[[K]]) +// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP4]](%[[C]]) +// CHECK-DAG: %[[OFFSET_C:.+]] = affine.apply #[[MAP5]](%[[C]]) +// CHECK-DAG: %[[IN_C_SZ:.+]] = affine.apply #[[MAP6]](%[[C]]) +// CHECK: %[[IN_SLICE:.+]] = tensor.extract_slice %[[IN]] +// CHECK: [%[[IN_C]], %[[IN_K]], 0, 0] [%[[IN_C_SZ]], %[[IN_K_SZ]], 32, 8] +// CHECK: %[[EMPTY:.+]] = tensor.empty +// CHECK: %[[UNPACK:.+]] = 
linalg.unpack +// CHECK-SAME: %[[IN_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] +// CHECK-SAME: into %[[EMPTY]] +// CHECK: %[[UNPACK_SLICE:.+]] = tensor.extract_slice %[[UNPACK]] +// CHECK-SAME: [%[[OFFSET_K]], %[[OFFSET_C]]] [2, 4] +// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK_SLICE]] +// CHECK-SAME: into %{{.+}}[%[[K]], %[[C]]] [2, 4] +// CHECK: scf.yield %[[RES]] +func.func @CKkc_to_KC(%source: tensor<32x4x32x8xf32>, %dest: tensor<128x256xf32>) -> tensor<128x256xf32> { + %0 = linalg.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %dest : tensor<32x4x32x8xf32> -> tensor<128x256xf32> + return %0 : tensor<128x256xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 floordiv 2)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 floordiv 4)> +// CHECK: func.func @perfect_CKkc_to_KC +// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: +// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[C8]] step %[[C2]] +// CHECK: %{{.+}} = scf.for %[[C:.+]] = %[[C0]] to %[[C128]] step %[[C4]] +// CHECK-DAG: %[[IN_K:.+]] = affine.apply #[[MAP0]](%[[K]]) +// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP1]](%[[C]]) +// CHECK: %[[IN_SLICE:.+]] = tensor.extract_slice %[[IN]] +// CHECK: [%[[IN_C]], %[[IN_K]], 0, 0] [1, 1, 2, 4] +// CHECK: %[[ITER_SLICE:.+]] = tensor.extract_slice %{{.+}}[%[[K]], %[[C]]] [2, 4] +// CHECK: %[[UNPACK:.+]] = linalg.unpack +// CHECK-SAME: %[[IN_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 4] +// CHECK-SAME: into %[[ITER_SLICE]] +// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK]] +// CHECK-SAME: into %{{.+}}[%[[K]], %[[C]]] [2, 4] +// CHECK: scf.yield %[[RES]] +func.func @perfect_CKkc_to_KC(%source: tensor<32x4x2x4xf32>, %dest: tensor<8x128xf32>) -> tensor<8x128xf32> { + %0 = linalg.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 4] into %dest : tensor<32x4x2x4xf32> -> tensor<8x128xf32> + return %0 : tensor<8x128xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 2)> +// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 2)> +// CHECK: func.func @dynamic_perfect_CKkc_to_KC +// 
CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: +// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[DIM_0:.+]] = tensor.dim %[[OUT]], %[[C0]] +// CHECK-DAG: %[[DIM_1:.+]] = tensor.dim %[[OUT]], %[[C1]] +// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[DIM_0]] step %[[C2]] +// CHECK: %{{.+}} = scf.for %[[C:.+]] = %[[C0]] to %[[DIM_1]] step %[[C4]] +// CHECK-DAG: %[[OUT_K_SZ:.+]] = affine.min #[[MAP0]](%[[K]])[%[[DIM_0]]] +// CHECK-DAG: %[[OUT_C_SZ:.+]] = affine.min #[[MAP1]](%[[C]])[%[[DIM_1]]] +// CHECK-DAG: %[[IN_K:.+]] = affine.apply #[[MAP2]](%[[K]]) +// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP2]](%[[C]]) +// CHECK-DAG: %[[IN_C_SZ:.+]] = affine.apply #[[MAP3]](%[[OUT_C_SZ]]) +// CHECK: %[[IN_SLICE:.+]] = tensor.extract_slice %[[IN]] +// CHECK: [%[[IN_C]], %[[IN_K]], 0, 0] [%[[IN_C_SZ]], 1, 2, 2] +// CHECK: %[[ITER_SLICE:.+]] = tensor.extract_slice %{{.+}}[%[[K]], %[[C]]] [%[[OUT_K_SZ]], %[[OUT_C_SZ]]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack +// CHECK-SAME: %[[IN_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 2] +// CHECK-SAME: into %[[ITER_SLICE]] +// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK]] +// CHECK-SAME: into %{{.+}}[%[[K]], %[[C]]] [%[[OUT_K_SZ]], %[[OUT_C_SZ]]] +// CHECK: scf.yield %[[RES]] + +func.func @dynamic_perfect_CKkc_to_KC(%source: tensor<?x?x2x2xf32>, %dest: tensor<?x?xf32>) -> tensor<?x?xf32> { + %0 = linalg.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %dest : tensor<?x?x2x2xf32> -> tensor<?x?xf32> + return %0 : tensor<?x?xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK: #[[MAP:.+]] = affine_map<(d0) -> (d0 floordiv 2)> +// CHECK: func.func @perfect_NKPQk_to_NPQK( +// CHECK-SAME: %[[SOURCE:.+]]: tensor<1x4x6x6x2xf32>, +// CHECK-SAME: %{{.+}}: tensor<1x6x6x8xf32>) +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index +// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index +// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK: %{{.+}} = scf.for %[[P:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %{{.+}} = scf.for %[[Q:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[C8]] step %[[C4]] +// CHECK: %[[K_SZ:.+]] = affine.apply #[[MAP]](%[[K]]) +// CHECK: %[[SLICE_SOURCE:.+]] = tensor.extract_slice %[[SOURCE]][0, %[[K_SZ]], %[[P]], %[[Q]], 0] +// CHECK: %[[SLICE_DEST:.+]] = tensor.extract_slice %{{.+}}[0, %[[P]], %[[Q]], %[[K]]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack +// CHECK-SAME: %[[SLICE_SOURCE]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [2] +// CHECK-SAME: into %[[SLICE_DEST]] +// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK]] +// CHECK-SAME: into %{{.+}}[0, %[[P]], %[[Q]], %[[K]]] +// CHECK: scf.yield %[[RES]] + +func.func @perfect_NKPQk_to_NPQK(%source: tensor<1x4x6x6x2xf32>, %dest: tensor<1x6x6x8xf32>) -> tensor<1x6x6x8xf32> { + %0 = linalg.unpack %source outer_dims_perm = [0, 3, 1, 2] 
inner_dims_pos = [3] inner_tiles = [2] into %dest : tensor<1x4x6x6x2xf32> -> tensor<1x6x6x8xf32> + return %0 : tensor<1x6x6x8xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +func.func private @get_dynamic_tile_size() -> index + +// CHECK-LABEL: func.func @fully_dynamic_unpack +// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] +// CHECK-SAME: %[[DST:[0-9a-zA-Z]+]] +// CHECK: %[[INNER_TS:.+]] = call @get_dynamic_tile_size() : () -> index +// CHECK: %[[TD0:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC0:.*]] = %[[DST]]) +// CHECK: %[[TD1:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC1:.*]] = %[[TC0]]) +// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[SRC]] +// CHECK: %[[EMPTY:.+]] = tensor.empty +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[SLICE]] +// CHECK-SAME: inner_dims_pos = [1, 0] inner_tiles = [%[[INNER_TS]], %[[INNER_TS]]] into %[[EMPTY]] +func.func @fully_dynamic_unpack(%source: tensor<?x?x?x?xf32>, %dest: tensor<?x?xf32>) -> tensor<?x?xf32> { + %0 = func.call @get_dynamic_tile_size() : () -> index + %1 = linalg.unpack %source inner_dims_pos = [1, 0] inner_tiles = [%0, %0] into %dest : tensor<?x?x?x?xf32> -> tensor<?x?xf32> + return %1 : tensor<?x?xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [4, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 * 2)> +// CHECK: func.func @perfect_NPQK_to_NKPQk +// CHECK-SAME: %[[SOURCE:.+]]: tensor<1x6x6x8xf32>, +// CHECK-SAME: %{{.+}}: tensor<1x4x6x6x2xf32>) +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK: %{{.+}} = scf.for %[[ARG2:.+]] = %[[C0]] to %[[C4]] step %[[C1]] +// CHECK: %{{.+}} = scf.for %[[ARG4:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %{{.+}} = scf.for %[[ARG6:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %[[APPLY:.+]] = affine.apply #[[MAP1]](%[[ARG2]]) +// CHECK: %[[SLICE_SOURCE:.+]] = tensor.extract_slice %[[SOURCE]][0, %[[ARG4]], %[[ARG6]], %[[APPLY]]] +// CHECK: %[[SLICE_DEST:.+]] = tensor.extract_slice %{{.+}}[0, %[[ARG2]], %[[ARG4]], %[[ARG6]], 0] +// CHECK: %[[PACK:.+]] = linalg.pack +// CHECK-SAME: %[[SLICE_SOURCE]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [2] +// CHECK-SAME: into %[[SLICE_DEST]] +// CHECK: %[[RES:.+]] = tensor.insert_slice %[[PACK]] +// CHECK-SAME: into %{{.+}}[0, %[[ARG2]], %[[ARG4]], %[[ARG6]], 0] +// CHECK: scf.yield %[[RES]] + +func.func @perfect_NPQK_to_NKPQk(%source: tensor<1x6x6x8xf32>, %dest: tensor<1x4x6x6x2xf32>) -> tensor<1x4x6x6x2xf32> { + %0 = linalg.pack %source outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [2] into %dest : tensor<1x6x6x8xf32> -> 
tensor<1x4x6x6x2xf32> + return %0 : tensor<1x4x6x6x2xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loops:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} diff --git a/mlir/test/Dialect/Linalg/transform-pack-greedily.mlir b/mlir/test/Dialect/Linalg/transform-pack-greedily.mlir index 100692426ef44..5812c4db88247 100644 --- a/mlir/test/Dialect/Linalg/transform-pack-greedily.mlir +++ b/mlir/test/Dialect/Linalg/transform-pack-greedily.mlir @@ -378,11 +378,11 @@ func.func @no_padding_on_packs(%A: tensor<32x32xf32>, %B: tensor<32x32xf32>, %C: } // CHECK-LABEL: no_padding_on_packs -// CHECK: tensor.pack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [8, 4] +// CHECK: linalg.pack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [8, 4] // CHECK-SAME: into %{{.+}} : tensor<32x32xf32> -> tensor<4x8x8x4xf32> -// CHECK: tensor.pack %{{.+}} outer_dims_perm = [1, 0] +// CHECK: linalg.pack %{{.+}} outer_dims_perm = [1, 0] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [4, 16] into %{{.+}} : tensor<32x32xf32> -> tensor<2x8x4x16xf32> -// CHECK: tensor.pack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [8, 16] +// CHECK: linalg.pack %{{.+}} inner_dims_pos = [0, 1] inner_tiles = [8, 16] // CHECK-SAME: into %{{.+}} : tensor<32x32xf32> -> tensor<4x2x8x16xf32> module attributes {transform.with_named_sequence} { @@ -393,12 +393,12 @@ module attributes {transform.with_named_sequence} { matmul_packed_sizes = [8, 16, 4] matmul_inner_dims_order = [0, 1, 2] : (!transform.op<"linalg.matmul">) -> !transform.op<"linalg.generic"> %pack = transform.get_producer_of_operand %1[1] - : (!transform.op<"linalg.generic">) -> (!transform.op<"tensor.pack">) + : (!transform.op<"linalg.generic">) -> (!transform.op<"linalg.pack">) %2, %pack_2, %empty_unpack_2 = transform.structured.pack_transpose %pack with_compute_op(%1) outer_perm = [1, 0] inner_perm = [1, 0] - : (!transform.op<"tensor.pack">, !transform.op<"linalg.generic">) - -> (!transform.op<"linalg.generic">, !transform.op<"tensor.pack">, !transform.any_op) + : (!transform.op<"linalg.pack">, !transform.op<"linalg.generic">) + -> (!transform.op<"linalg.generic">, !transform.op<"linalg.pack">, !transform.any_op) transform.yield } } diff --git a/mlir/test/Dialect/Linalg/transform-tile-and-fuse-pack-unpack.mlir b/mlir/test/Dialect/Linalg/transform-tile-and-fuse-pack-unpack.mlir index faf7ff9ad7ed0..5d4ae4f15d3fd 100644 --- a/mlir/test/Dialect/Linalg/transform-tile-and-fuse-pack-unpack.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-and-fuse-pack-unpack.mlir @@ -14,7 +14,7 @@ module { func.func @fuse_pack_as_producer(%src: tensor<128x256xf32>, %other: tensor<4x4x128x256xf32>) -> tensor<4x4x128x256xf32> { %dest = tensor.empty() : tensor<1x1x128x256xf32> - %pack = tensor.pack %src inner_dims_pos = [0, 1] inner_tiles = [128, 256] + %pack = linalg.pack %src inner_dims_pos = [0, 1] inner_tiles = [128, 256] into %dest : tensor<128x256xf32> -> tensor<1x1x128x256xf32> %out = tensor.empty() : tensor<4x4x128x256xf32> @@ -36,10 +36,10 @@ module { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { // Find and lower 
pack operation. - %pack = transform.structured.match ops{["tensor.pack"]} in %arg1 - : (!transform.any_op) -> !transform.op<"tensor.pack"> + %pack = transform.structured.match ops{["linalg.pack"]} in %arg1 + : (!transform.any_op) -> !transform.op<"linalg.pack"> %paded, %expanded, %transpose = transform.structured.lower_pack %pack {lowerPadLikeWithInsertSlice = false} - : (!transform.op<"tensor.pack">) + : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) @@ -72,7 +72,7 @@ module { func.func @fuse_pack_as_producer_blocked_by_insert_slice(%src: tensor<128x256xf32>, %other: tensor<4x4x128x256xf32>) -> tensor<4x4x128x256xf32> { %dest = tensor.empty() : tensor<1x1x128x256xf32> - %pack = tensor.pack %src inner_dims_pos = [0, 1] inner_tiles = [128, 256] + %pack = linalg.pack %src inner_dims_pos = [0, 1] inner_tiles = [128, 256] into %dest : tensor<128x256xf32> -> tensor<1x1x128x256xf32> %out = tensor.empty() : tensor<4x4x128x256xf32> @@ -94,10 +94,10 @@ module { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { // Find and lower pack operation. - %pack = transform.structured.match ops{["tensor.pack"]} in %arg1 - : (!transform.any_op) -> !transform.op<"tensor.pack"> + %pack = transform.structured.match ops{["linalg.pack"]} in %arg1 + : (!transform.any_op) -> !transform.op<"linalg.pack"> %paded, %expanded, %transpose = transform.structured.lower_pack %pack - : (!transform.op<"tensor.pack">) + : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) @@ -143,7 +143,7 @@ module { } -> tensor<1x1x128x256xf32> %dest = tensor.empty() : tensor<128x256xf32> - %unpack = tensor.unpack %res inner_dims_pos = [0, 1] inner_tiles = [128, 256] + %unpack = linalg.unpack %res inner_dims_pos = [0, 1] inner_tiles = [128, 256] into %dest : tensor<1x1x128x256xf32> -> tensor<128x256xf32> return %unpack : tensor<128x256xf32> @@ -152,10 +152,10 @@ module { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { // Find and lower unpack operation. - %unpack = transform.structured.match ops{["tensor.unpack"]} in %arg1 - : (!transform.any_op) -> !transform.op<"tensor.unpack"> + %unpack = transform.structured.match ops{["linalg.unpack"]} in %arg1 + : (!transform.any_op) -> !transform.op<"linalg.unpack"> transform.structured.lower_unpack %unpack {lowerUnpadLikeWithExtractSlice = false} - : (!transform.op<"tensor.unpack">) + : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, @@ -204,7 +204,7 @@ module { } -> tensor<1x1x128x256xf32> %dest = tensor.empty() : tensor<128x256xf32> - %unpack = tensor.unpack %res inner_dims_pos = [0, 1] inner_tiles = [128, 256] + %unpack = linalg.unpack %res inner_dims_pos = [0, 1] inner_tiles = [128, 256] into %dest : tensor<1x1x128x256xf32> -> tensor<128x256xf32> return %unpack : tensor<128x256xf32> @@ -213,10 +213,10 @@ module { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { // Find and lower unpack operation. 
- %unpack = transform.structured.match ops{["tensor.unpack"]} in %arg1 - : (!transform.any_op) -> !transform.op<"tensor.unpack"> + %unpack = transform.structured.match ops{["linalg.unpack"]} in %arg1 + : (!transform.any_op) -> !transform.op<"linalg.unpack"> transform.structured.lower_unpack %unpack - : (!transform.op<"tensor.unpack">) + : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, diff --git a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir index 8fbc74ec345c6..8f3b199145ce0 100644 --- a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir +++ b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir @@ -115,13 +115,13 @@ module attributes {transform.with_named_sequence} { func.func @test_pack_no_vectorize_dynamic_shape(%arg0: tensor, %arg1: tensor<4x16xf32>) -> tensor<4x16xf32> { %pad = arith.constant 0.000000e+00 : f32 // expected-error @+1 {{Attempted to vectorize, but failed}} - %pack = tensor.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [0] inner_tiles = [16] into %arg1 : tensor -> tensor<4x16xf32> + %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [0] inner_tiles = [16] into %arg1 : tensor -> tensor<4x16xf32> return %pack : tensor<4x16xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 : !transform.any_op transform.yield } diff --git a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir index 5ae3f893c2e73..9f2ee47b45b3e 100644 --- a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir +++ b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir @@ -1944,13 +1944,13 @@ module attributes {transform.with_named_sequence} { // masking was used. 
func.func @test_vectorize_pack(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> { - %pack = tensor.pack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32> + %pack = linalg.pack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32> return %pack : tensor<4x1x32x16x2xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op transform.yield @@ -1977,7 +1977,7 @@ module attributes {transform.with_named_sequence} { func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { %pad = arith.constant 0.000000e+00 : f32 - %pack = tensor.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> + %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> return %pack : tensor<32x4x1x16x2xf32> } @@ -1995,7 +1995,7 @@ func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor< module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op transform.yield diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir index 6d39262945de5..c6d9ec6215715 100644 --- a/mlir/test/Dialect/Linalg/vectorization.mlir +++ b/mlir/test/Dialect/Linalg/vectorization.mlir @@ -671,7 +671,7 @@ module attributes {transform.with_named_sequence} { // masking was used. 
func.func @test_vectorize_pack(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> { - %pack = tensor.pack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32> + %pack = linalg.pack %arg0 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32> return %pack : tensor<4x1x32x16x2xf32> } // CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32 @@ -688,7 +688,7 @@ func.func @test_vectorize_pack(%arg0: tensor<32x8x16xf32>, %arg1: tensor<4x1x32x module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [4, 1, 32] : !transform.any_op transform.yield } @@ -702,7 +702,7 @@ module attributes {transform.with_named_sequence} { func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { %pad = arith.constant 0.000000e+00 : f32 - %pack = tensor.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> + %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> return %pack : tensor<32x4x1x16x2xf32> } // CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32 @@ -725,7 +725,7 @@ func.func @test_vectorize_padded_pack(%arg0: tensor<32x7x15xf32>, %arg1: tensor< module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [32, 4, 1] : !transform.any_op transform.yield } @@ -734,7 +734,7 @@ module attributes {transform.with_named_sequence} { // ----- func.func @test_vectorize_dynamic_pack(%arg0: tensor, %arg1: tensor) -> tensor { - %pack = tensor.pack %arg0 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg1 : tensor -> tensor + %pack = linalg.pack %arg0 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg1 : tensor -> tensor return %pack : tensor } // CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32 @@ -766,7 +766,7 @@ func.func @test_vectorize_dynamic_pack(%arg0: tensor, %arg1: tensor !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [4, 1] : !transform.any_op transform.yield } @@ -893,12 +893,12 @@ func.func @test_vectorize_dynamic_shapes_unpack(%arg0: tensor, %arg1: t // CHECK: %[[writeMsk0:.*]] = vector.create_mask {{.*}} : vector<4x16xi1> // CHECK: %[[write0:.*]] = vector.mask %[[writeMsk0:.*]] {{.*}} vector.transfer_write %[[sc0]], %[[empt0]] // CHECK: return %[[write0]] - %ret = tensor.unpack %arg1 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg0 : tensor -> tensor + %ret = linalg.unpack %arg1 inner_dims_pos = [1, 0] inner_tiles 
= [16, 2] into %arg0 : tensor -> tensor return %ret : tensor } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [4, 16] : !transform.any_op transform.yield } @@ -925,12 +925,12 @@ func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<2 // CHECK: %[[WRITEMSK:.*]] = vector.create_mask %[[C256]], %[[C128]] : vector<512x128xi1> // CHECK: %[[WRIT:.*]] = vector.mask %[[WRITEMSK]] {{.*}} : vector<512x128xi1> -> tensor<256x128xf32> // CHECK: return %[[WRIT]] : tensor<256x128xf32> - %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> + %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> return %0 : tensor<256x128xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [512, 128] : !transform.any_op transform.yield } @@ -949,12 +949,12 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: // CHECK: %[[C00:.*]] = arith.constant 0 : index // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32> // CHECK: return %[[WRIT]] : tensor<256x128xf32> - %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> + %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> return %0 : tensor<256x128xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op transform.yield } @@ -973,12 +973,12 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: // CHECK: %[[C00:.*]] = arith.constant 0 : index // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32> // CHECK: return %[[WRIT]] : tensor<256x128xf32> - %0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> + %0 = linalg.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> return %0 : tensor<256x128xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} 
in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op transform.yield } @@ -988,7 +988,7 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: // CHECK-LABEL: test_vectorize_pack_no_vector_sizes func.func @test_vectorize_pack_no_vector_sizes(%arg0: tensor<64x4xf32>, %arg1: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> { - %pack = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %arg1 : tensor<64x4xf32> -> tensor<2x4x16x2xf32> + %pack = linalg.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %arg1 : tensor<64x4xf32> -> tensor<2x4x16x2xf32> return %pack : tensor<2x4x16x2xf32> } // CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32 @@ -1005,7 +1005,7 @@ func.func @test_vectorize_pack_no_vector_sizes(%arg0: tensor<64x4xf32>, %arg1: t module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 : !transform.any_op transform.yield } @@ -1016,7 +1016,7 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: test_vectorize_padded_pack_no_vector_sizes func.func @test_vectorize_padded_pack_no_vector_sizes(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { %pad = arith.constant 0.000000e+00 : f32 - %pack = tensor.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> + %pack = linalg.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> return %pack : tensor<32x4x1x16x2xf32> } // CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32 @@ -1033,7 +1033,7 @@ func.func @test_vectorize_padded_pack_no_vector_sizes(%arg0: tensor<32x7x15xf32> module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 : !transform.any_op transform.yield } @@ -1051,12 +1051,12 @@ func.func @test_vectorize_unpack_no_vector_sizes(%source: tensor<8x8x32x16xf32>, // CHECK: %[[C00:.*]] = arith.constant 0 : index // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32> // CHECK: return %[[WRIT]] : tensor<256x128xf32> - %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> + %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> return %0 : tensor<256x128xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match 
ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 : !transform.any_op transform.yield } @@ -1075,12 +1075,12 @@ func.func @test_vectorize_unpack_no_vector_sizes_slice_output(%source: tensor<8x // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], %[[EMPT]]{{\[}}%[[C00]], %[[C00]]] // CHECK-SAME: {in_bounds = [true, false]} : vector<64x128xf32>, tensor<64x127xf32> // CHECK: return %[[WRIT]] : tensor<64x127xf32> - %0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %dest : tensor<8x4x16x16xf32> -> tensor<64x127xf32> + %0 = linalg.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %dest : tensor<8x4x16x16xf32> -> tensor<64x127xf32> return %0 : tensor<64x127xf32> } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 : !transform.any_op transform.yield } @@ -1089,7 +1089,7 @@ func.func @test_vectorize_unpack_no_vector_sizes_slice_output(%source: tensor<8x // ----- func.func @test_vectorize_unpack_no_vector_sizes_permute(%source: tensor<4x7x4xf32>, %dest: tensor<7x16xf32>) -> tensor<7x16xf32> { - %0 = tensor.unpack %source outer_dims_perm=[1, 0] inner_dims_pos = [1] inner_tiles = [4] into %dest : tensor<4x7x4xf32> -> tensor<7x16xf32> + %0 = linalg.unpack %source outer_dims_perm=[1, 0] inner_dims_pos = [1] inner_tiles = [4] into %dest : tensor<4x7x4xf32> -> tensor<7x16xf32> return %0 : tensor<7x16xf32> } // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 @@ -1103,7 +1103,7 @@ func.func @test_vectorize_unpack_no_vector_sizes_permute(%source: tensor<4x7x4xf // CHECK: return %[[WRIT]] : tensor<7x16xf32> module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %0 = transform.structured.match ops{["linalg.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op transform.structured.vectorize %0 : !transform.any_op transform.yield } diff --git a/mlir/test/Dialect/SPIRV/IR/control-flow-ops.mlir b/mlir/test/Dialect/SPIRV/IR/control-flow-ops.mlir index 8496448759f0c..1d1e2840a579a 100644 --- a/mlir/test/Dialect/SPIRV/IR/control-flow-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/control-flow-ops.mlir @@ -789,3 +789,15 @@ func.func @unreachable() { // expected-error @+1 {{cannot be used in reachable block}} spirv.Unreachable } + +// ----- + +//===----------------------------------------------------------------------===// +// spirv.Kill +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @kill +func.func @kill() { + // CHECK: spirv.Kill + spirv.Kill +} diff --git a/mlir/test/Dialect/SPIRV/Transforms/inlining.mlir b/mlir/test/Dialect/SPIRV/Transforms/inlining.mlir index bd3c665013136..8eb48a34e61e8 100644 --- a/mlir/test/Dialect/SPIRV/Transforms/inlining.mlir +++ b/mlir/test/Dialect/SPIRV/Transforms/inlining.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s 
-split-input-file -pass-pipeline='builtin.module(spirv.module(inline{default-pipeline=''}))' | FileCheck %s +// RUN: mlir-opt %s --split-input-file --pass-pipeline='builtin.module(spirv.module(inline{default-pipeline=''}))' | FileCheck %s spirv.module Logical GLSL450 { spirv.func @callee() "None" { @@ -246,5 +246,24 @@ spirv.module Logical GLSL450 { } } +// ----- + +spirv.module Logical GLSL450 { + // CHECK-LABEL: @callee + spirv.func @callee() -> () "None" { + // CHECK-NEXT: spirv.Kill + spirv.Kill + } + + // CHECK-LABEL: @do_not_inline_kill + spirv.func @do_not_inline_kill() -> () "None" { + // CHECK-NOT: spirv.Kill + // CHECK-NEXT: spirv.FunctionCall @callee() : () -> () + spirv.FunctionCall @callee() : () -> () + // CHECK-NEXT: spirv.Return + spirv.Return + } +} + // TODO: Add tests for inlining structured control flow into // structured control flow. diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index 01d14871072cd..90cc0ca658ffb 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -899,225 +899,6 @@ func.func @fold_extract_constant_splat() -> (tensor<4x4xi32>) { // ----- -// CHECK-LABEL: func @fold_pack_constant_splat -// CHECK-NOT: tensor.pack -// CHECK: arith.constant dense<1.000000e-01> : tensor<8x16x8x32xf32> -func.func @fold_pack_constant_splat(%dest : tensor<8x16x8x32xf32>) -> tensor<8x16x8x32xf32> { - %cst = arith.constant dense<1.000000e-01> : tensor<64x128xf32> - %0 = tensor.pack %cst outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] - inner_tiles = [8, 32] into %dest : tensor<64x128xf32> -> tensor<8x16x8x32xf32> - return %0 : tensor<8x16x8x32xf32> -} - -// ----- - -// CHECK-LABEL: func @fold_padding_value_pack_constant_splat -// CHECK-NOT: tensor.pack -// CHECK: arith.constant dense<1.000000e-01> : tensor<8x16x8x32xf32> -func.func @fold_padding_value_pack_constant_splat(%dest : tensor<8x16x8x32xf32>) -> tensor<8x16x8x32xf32> { - %pad = arith.constant 1.000000e-01 : f32 - %cst = arith.constant dense<1.000000e-01> : tensor<63x127xf32> - %0 = tensor.pack %cst - padding_value(%pad : f32) - outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] - inner_tiles = [8, 32] into %dest : tensor<63x127xf32> -> tensor<8x16x8x32xf32> - return %0 : tensor<8x16x8x32xf32> -} - - -// ----- - -// CHECK-LABEL: func @nofold_padding_value_pack_constant_splat -// CHECK: arith.constant dense<1.000000e-01> : tensor<63x127xf32> -// CHECK: tensor.pack -func.func @nofold_padding_value_pack_constant_splat(%dest : tensor<8x16x8x32xf32>) -> tensor<8x16x8x32xf32> { - %pad = arith.constant 0.0 : f32 - %cst = arith.constant dense<1.000000e-01> : tensor<63x127xf32> - %0 = tensor.pack %cst - padding_value(%pad : f32) - outer_dims_perm = [1, 0] - inner_dims_pos = [0, 1] - inner_tiles = [8, 32] - into %dest : tensor<63x127xf32> -> tensor<8x16x8x32xf32> - return %0 : tensor<8x16x8x32xf32> -} - -// ----- - -func.func @fold_padding_value_pack(%arg0: tensor<1200x500000xf32>) -> tensor<31250x1200x16x1xf32> { - %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.empty() : tensor<31250x1200x16x1xf32> - %pack = tensor.pack %arg0 - padding_value(%cst : f32) - outer_dims_perm = [1, 0] - inner_dims_pos = [1, 0] - inner_tiles = [16, 1] - into %0 : tensor<1200x500000xf32> -> tensor<31250x1200x16x1xf32> - return %pack : tensor<31250x1200x16x1xf32> -} -// CHECK-LABEL: func @fold_padding_value_pack -// CHECK-NOT: padding_value - -// ----- - -func.func @infer_src_shape_pack(%src: tensor, %dest: tensor<10x20x30x40x16xf32>) -> 
tensor<10x20x30x40x16xf32> { - %cst = arith.constant 0.000000e+00 : f32 - %pack = tensor.pack %src - padding_value(%cst : f32) - outer_dims_perm = [2, 1, 3, 0] - inner_dims_pos = [2] - inner_tiles = [16] - into %dest : tensor -> tensor<10x20x30x40x16xf32> - return %pack : tensor<10x20x30x40x16xf32> -} -// CHECK-LABEL: func.func @infer_src_shape_pack -// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] -// CHECK-SAME: %[[DEST:[0-9a-zA-Z]+]] -// CHECK: %[[CAST_SRC:.+]] = tensor.cast %[[SRC]] : tensor to tensor<40x20x?x30xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[CAST_SRC]] {{.+}} into %[[DEST]] -// CHECK: return %[[PACK]] - -// ----- - -func.func @infer_dest_shape_pack(%src: tensor<30x20x?x10xf32>, %dest: tensor) -> tensor { - %cst = arith.constant 0.000000e+00 : f32 - %pack = tensor.pack %src - padding_value(%cst : f32) - outer_dims_perm = [2, 1, 3, 0] - inner_dims_pos = [2] - inner_tiles = [16] - into %dest : tensor<30x20x?x10xf32> -> tensor - return %pack : tensor -} -// CHECK-LABEL: func.func @infer_dest_shape_pack -// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] -// CHECK-SAME: %[[DEST:[0-9a-zA-Z]+]] -// CHECK: %[[CAST_DEST:.+]] = tensor.cast %[[DEST]] : tensor to tensor -// CHECK: %[[PACK:.+]] = tensor.pack %[[SRC]] {{.+}} into %[[CAST_DEST]] -// CHECK: %[[CAST_PACK:.+]] = tensor.cast %[[PACK]] : tensor to tensor -// CHECK: return %[[CAST_PACK]] - -// ----- - -func.func @no_infer_pack_shape(%arg0: tensor, %arg1: index) -> tensor<32x7x?x16x1xf32> { - %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.empty(%arg1) : tensor<32x7x?x16x1xf32> - %pack = tensor.pack %arg0 padding_value(%cst : f32) outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %0 : tensor -> tensor<32x7x?x16x1xf32> - return %pack : tensor<32x7x?x16x1xf32> -} -// CHECK-LABEL: func.func @no_infer_pack_shape -// CHECK-NOT: tensor.cast - -// ----- - -func.func @fold_padding_value_pack_negative1(%arg0: tensor<1200x499999xf32>) -> tensor<31250x1200x16x1xf32> { - %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.empty() : tensor<31250x1200x16x1xf32> - %pack = tensor.pack %arg0 - padding_value(%cst : f32) - outer_dims_perm = [1, 0] - inner_dims_pos = [1, 0] - inner_tiles = [16, 1] - into %0 : tensor<1200x499999xf32> -> tensor<31250x1200x16x1xf32> - return %pack : tensor<31250x1200x16x1xf32> -} -// CHECK-LABEL: func @fold_padding_value_pack_negative1 -// CHECK: tensor.pack -// CHECK-SAME: padding_value - -// ----- - -func.func @fold_padding_value_pack_negative2(%arg0: tensor<1200x?xf32>, %arg1: tensor) -> tensor { - %cst = arith.constant 0.000000e+00 : f32 - %pack = tensor.pack %arg0 - padding_value(%cst : f32) - outer_dims_perm = [1, 0] - inner_dims_pos = [1, 0] - inner_tiles = [16, 1] - into %arg1 : tensor<1200x?xf32> -> tensor - return %pack : tensor -} -// CHECK-LABEL: func @fold_padding_value_pack_negative2 -// CHECK: tensor.pack -// CHECK-SAME: padding_value - -// ----- - -func.func @fold_padding_value_pack_negative3(%arg0: tensor<1200x500000xf32>, %arg1: tensor, %tile : index) -> tensor { - %cst = arith.constant 0.000000e+00 : f32 - %pack = tensor.pack %arg0 - padding_value(%cst : f32) - outer_dims_perm = [1, 0] - inner_dims_pos = [1, 0] - inner_tiles = [%tile, 1] - into %arg1 : tensor<1200x500000xf32> -> tensor - return %pack : tensor -} -// CHECK-LABEL: func @fold_padding_value_pack_negative3 -// CHECK: tensor.pack -// CHECK-SAME: padding_value - -// ----- - -// CHECK-LABEL: func @fold_unpack_constant_splat -// CHECK-NOT: tensor.unpack -// CHECK: arith.constant dense<1.000000e-01> : tensor<128x256xf32> 
-func.func @fold_unpack_constant_splat(%dest : tensor<128x256xf32>) -> tensor<128x256xf32> { - %cst = arith.constant dense<1.000000e-01> : tensor<16x8x8x32xf32> - %0 = tensor.unpack %cst inner_dims_pos = [0, 1] - inner_tiles = [8, 32] into %dest : tensor<16x8x8x32xf32> -> tensor<128x256xf32> - return %0 : tensor<128x256xf32> -} - -// ----- - -func.func @infer_dest_shape_unpack(%src: tensor<10x20x30x40x16xf32>, %dest: tensor) -> tensor { - %unpack = tensor.unpack %src - outer_dims_perm = [2, 1, 3, 0] - inner_dims_pos = [2] - inner_tiles = [16] - into %dest : tensor<10x20x30x40x16xf32> -> tensor - return %unpack : tensor -} -// CHECK-LABEL: func.func @infer_dest_shape_unpack -// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] -// CHECK-SAME: %[[DEST:[0-9a-zA-Z]+]] -// CHECK: %[[CAST_DEST:.+]] = tensor.cast %[[DEST]] : tensor to tensor<40x20x?x30xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[SRC]] {{.+}} into %[[CAST_DEST]] -// CHECK: %[[CAST_UNPACK:.+]] = tensor.cast %[[UNPACK]] : tensor<40x20x?x30xf32> to tensor -// CHECK: return %[[CAST_UNPACK]] - -// ----- - -func.func @infer_src_shape_unpack(%src: tensor, %dest: tensor<30x20x?x10xf32>) -> tensor<30x20x?x10xf32> { - %unpack = tensor.unpack %src - outer_dims_perm = [2, 1, 3, 0] - inner_dims_pos = [2] - inner_tiles = [16] - into %dest : tensor -> tensor<30x20x?x10xf32> - return %unpack : tensor<30x20x?x10xf32> -} -// CHECK-LABEL: func.func @infer_src_shape_unpack -// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] -// CHECK-SAME: %[[DEST:[0-9a-zA-Z]+]] -// CHECK: %[[CAST_SRC:.+]] = tensor.cast %[[SRC]] : tensor to tensor -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[CAST_SRC]] -// CHECK: return %[[UNPACK]] - -// ----- - -func.func @no_infer_unpack_shape(%arg1: tensor<32x7x?x16x1xf32>, %arg2: index) -> tensor { - %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.empty(%arg2) : tensor - %unpack = tensor.unpack %arg1 outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 0] inner_tiles = [16, 1] into %0 : tensor<32x7x?x16x1xf32> -> tensor - return %unpack : tensor -} -// CHECK-LABEL: func.func @no_infer_unpack_shape -// CHECK-NOT: tensor.cast - -// ----- - - // CHECK-LABEL: func @fold_overlapping_insert // CHECK-SAME: %[[INPUT:.+]]: tensor, %{{.+}}: tensor<4x?x8xf32>, %[[SLICE2:.+]]: tensor<4x?x8xf32> func.func @fold_overlapping_insert(%input : tensor, %slice1: tensor<4x?x8xf32>, %slice2: tensor<4x?x8xf32>, %i: index, %size: index) -> (tensor) { @@ -2370,174 +2151,6 @@ func.func @collapse_expand_fold_to_cast(%t: tensor, %sz0: index) -> (tens // ----- -// Chain: NC -> NCnc -> NCnc -> NC -// CHECK: func.func @unpack_pack( -// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>) -// CHECK: return %[[T]] : tensor<128x128xf32> -func.func @unpack_pack(%t: tensor<128x128xf32>) -> tensor<128x128xf32> { - %tensor_empty = tensor.empty() : tensor<16x16x8x8xf32> - %packed = tensor.pack %t inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty : tensor<128x128xf32> -> tensor<16x16x8x8xf32> - %tensor_empty1 = tensor.empty() : tensor<128x128xf32> - %unpacked = tensor.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty1 : tensor<16x16x8x8xf32> -> tensor<128x128xf32> - return %unpacked : tensor<128x128xf32> -} - -// ----- - -// Chain: NC -> NCcn -> NCnc -> NC -// CHECK: func.func @unpack_pack( -// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>) -// CHECK-NOT: return %[[T]] : tensor<128x128xf32> -func.func @unpack_pack(%t: tensor<128x128xf32>) -> tensor<128x128xf32> { - %tensor_empty = tensor.empty() : tensor<16x16x8x8xf32> - %packed = tensor.pack %t 
inner_dims_pos = [1, 0] inner_tiles = [8, 8] into %tensor_empty : tensor<128x128xf32> -> tensor<16x16x8x8xf32> - %tensor_empty1 = tensor.empty() : tensor<128x128xf32> - %unpacked = tensor.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty1 : tensor<16x16x8x8xf32> -> tensor -<128x128xf32> - return %unpacked : tensor<128x128xf32> -} - -// ----- - -// Chain: NC -> CNcn -> NCnc -> NC -// CHECK: func.func @unpack_pack( -// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>) -// CHECK-NOT: return %[[T]] : tensor<128x128xf32> -func.func @unpack_pack(%t: tensor<128x128xf32>) -> tensor<128x128xf32> { - %tensor_empty = tensor.empty() : tensor<16x16x8x8xf32> - %packed = tensor.pack %t outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [8, 8] into %tensor_empty : tensor<128x128xf32> -> tensor<16x16x8x8xf32> - %tensor_empty1 = tensor.empty() : tensor<128x128xf32> - %unpacked = tensor.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty1 : tensor<16x16x8x8xf32> -> tensor -<128x128xf32> - return %unpacked : tensor<128x128xf32> -} - -// ----- - -// Chain: NC -> NCnc -> NCnc -> NC -// CHECK: func.func @unpack_pack( -// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>, -// CHECK: return %[[T]] : tensor<128x128xf32> -func.func @unpack_pack(%t: tensor<128x128xf32>, %tile1: index, %tile2: index) -> tensor<128x128xf32> { - %tensor_empty = tensor.empty(%tile1, %tile2) : tensor<16x16x?x?xf32> - %packed = tensor.pack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor<128x128xf32> -> tensor<16x16x?x?xf32> - %tensor_empty1 = tensor.empty() : tensor<128x128xf32> - %unpacked = tensor.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor<16x16x?x?xf32> -> tensor -<128x128xf32> - return %unpacked : tensor<128x128xf32> -} - -// ----- - -// CHECK: func.func @unpack_pack_with_padding_no_canonicalization( -// CHECK: tensor.pack -// CHECK: tensor.unpack -func.func @unpack_pack_with_padding_no_canonicalization(%t: tensor<256x512xbf16>) -> tensor<224x512xbf16> { - %tensor_empty = tensor.empty() : tensor<4x16x64x32xbf16> - %tensor_empty1 = tensor.empty() : tensor<224x512xbf16> - %packed = tensor.pack %t outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %tensor_empty : tensor<256x512xbf16> -> tensor<4x16x64x32xbf16> - %unpacked = tensor.unpack %packed inner_dims_pos = [0, 1] inner_tiles = [64, 32] into %tensor_empty1 : tensor<4x16x64x32xbf16> -> tensor<224x512xbf16> - return %unpacked : tensor<224x512xbf16> -} - -// ----- - -// Chain NCnc -> NC -> NC -> NCnc -// CHECK: func.func @pack_unpack( -// CHECK-SAME: %[[T:.+]]: tensor<16x16x?x?xf32>, -// CHECK: return %[[T]] : tensor<16x16x?x?xf32> -func.func @pack_unpack(%t: tensor<16x16x?x?xf32>, %tile1: index, %tile2: index) -> tensor<16x16x?x?xf32> { - %tensor_empty = tensor.empty() : tensor<128x128xf32> - %unpacked = tensor.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor<16x16x?x?xf32> -> tensor<128x128xf32> - %tensor_empty1 = tensor.empty(%tile1, %tile2) : tensor<16x16x?x?xf32> - %packed = tensor.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor<128x128xf32> -> tensor<16x16x?x?xf32> - return %packed : tensor<16x16x?x?xf32> -} - -// ----- - -// Chain NCnc -> NC -> NC -> NCnc -// CHECK: func.func @pack_unpack( -// CHECK-SAME: %[[T:.+]]: tensor<16x16x8x8xf32> -// CHECK: return %[[T]] : tensor<16x16x8x8xf32> -func.func @pack_unpack(%t: 
tensor<16x16x8x8xf32>) -> tensor<16x16x8x8xf32> { - %tensor_empty = tensor.empty() : tensor<128x128xf32> - %unpacked = tensor.unpack %t inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty : tensor<16x16x8x8xf32> -> tensor<128x128xf32> - %tensor_empty1 = tensor.empty() : tensor<16x16x8x8xf32> - %packed = tensor.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty1 : tensor<128x128xf32> -> tensor<16x16x8x8xf32> - return %packed : tensor<16x16x8x8xf32> -} - -// ----- - -// CHECK: func.func @pack_unpack_same_tiles( -// CHECK-SAME: %[[T:.+]]: tensor, -// CHECK: return %[[T]] : tensor -func.func @pack_unpack_same_tiles(%t: tensor, %dim1: index, %dim2: index, %dim3: index, %dim4: index, %dim5: index, %dim6: index, - %tile1: index, %tile2: index) -> tensor { - %tensor_empty = tensor.empty(%dim1, %dim2) : tensor - %unpacked = tensor.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor -> tensor - %tensor_empty1 = tensor.empty(%dim3, %dim4, %dim5, %dim6) : tensor - %packed = tensor.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor -> tensor - return %packed : tensor -} - -// ----- - -// CHECK: func.func @pack_unpack_different_tiles( -// CHECK-SAME: %[[T:.+]]: tensor, -// CHECK-NOT: return %[[T]] : tensor -func.func @pack_unpack_different_tiles(%t: tensor, %dim1: index, %dim2: index, %dim3: index, %dim4: index, %dim5: index, %dim6: index, - %tile1: index, %tile2: index) -> tensor { - %tensor_empty = tensor.empty(%dim1, %dim2) : tensor - %unpacked = tensor.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor -> tensor - %tensor_empty1 = tensor.empty(%dim3, %dim4, %dim5, %dim6) : tensor - %packed = tensor.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [%tile2, %tile1] into %tensor_empty1 : tensor -> tensor - return %packed : tensor -} - -// ----- - -// CHECK: func.func @pack_unpack_dynamic_with_padding( -// CHECK-SAME: %[[T:.+]]: tensor, -// CHECK-NOT: return %[[T]] : tensor -func.func @pack_unpack_dynamic_with_padding(%t: tensor, %dim1: index, %dim2: index, %dim3: index, %dim4: index, %dim5: index, %dim6: index, - %tile1: index, %tile2: index, %pad: f32) -> tensor { - %tensor_empty = tensor.empty(%dim1, %dim2) : tensor - %unpacked = tensor.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor -> tensor - %tensor_empty1 = tensor.empty(%dim3, %dim4, %dim5, %dim6) : tensor - %packed = tensor.pack %unpacked padding_value(%pad: f32) inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor -> tensor - return %packed : tensor -} - -// ----- - -// CHECK: func.func @pack_outer_dims_unpack_no_outer_dims( -// CHECK-SAME: %[[T:.+]]: tensor<16x16x?x?xf32>, -// CHECK: return %[[T]] : tensor<16x16x?x?xf32> -func.func @pack_outer_dims_unpack_no_outer_dims(%t: tensor<16x16x?x?xf32>, %tile1: index, %tile2: index) -> tensor<16x16x?x?xf32> { - %tensor_empty = tensor.empty() : tensor<128x128xf32> - %unpacked = tensor.unpack %t inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor<16x16x?x?xf32> -> tensor<128x128xf32> - %tensor_empty1 = tensor.empty(%tile1, %tile2) : tensor<16x16x?x?xf32> - %packed = tensor.pack %unpacked outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor<128x128xf32> -> tensor<16x16x?x?xf32> - return %packed : tensor<16x16x?x?xf32> -} - -// ----- - -// CHECK: func.func 
@pack_no_outer_dims_unpack_outer_dims( -// CHECK-SAME: %[[T:.+]]: tensor<16x16x?x?xf32>, -// CHECK: return %[[T]] : tensor<16x16x?x?xf32> -func.func @pack_no_outer_dims_unpack_outer_dims(%t: tensor<16x16x?x?xf32>, %tile1: index, %tile2: index) -> tensor<16x16x?x?xf32> { - %tensor_empty = tensor.empty() : tensor<128x128xf32> - %unpacked = tensor.unpack %t outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty : tensor<16x16x?x?xf32> -> tensor<128x128xf32> - %tensor_empty1 = tensor.empty(%tile1, %tile2) : tensor<16x16x?x?xf32> - %packed = tensor.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [%tile1, %tile2] into %tensor_empty1 : tensor<128x128xf32> -> tensor<16x16x?x?xf32> - return %packed : tensor<16x16x?x?xf32> -} - -// ----- - // CHECK: func.func @invalid_empty_negative_size // CHECK: %[[IDX:.*]] = index.constant // CHECK: %[[T:.*]] = tensor.empty(%[[IDX]]) : tensor<4x5x?xf32> @@ -2551,22 +2164,6 @@ func.func @invalid_empty_negative_size() -> (tensor<4x5x?xf32>) { // ----- -// Fold DstStyleOp -> tensor.unpack operations. -func.func @fold_dst_style_ops_into_unpack(%arg0 : tensor, %init : tensor) -> tensor { - %cst = arith.constant 0.0 : f32 - %fill = linalg.fill ins(%cst : f32) outs(%init : tensor) -> tensor - %unpack = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [16, 64] into %fill : tensor -> tensor - return %unpack : tensor -} -// CHECK-LABEL: func @fold_dst_style_ops_into_unpack -// CHECK-SAME: %[[ARG0:.+]]: tensor -// CHECK-SAME: %[[INIT:.+]]: tensor -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] -// CHECK-SAME: into %[[INIT]] -// CHECK: return %[[UNPACK]] - -// ----- - // The IR in this test case in invalid. This test tests that the canonicalizer // does not crash. @@ -2598,21 +2195,6 @@ func.func @generate_negative_size_verifies() -> tensor { return %tensor : tensor } -// ----- - -func.func @infer_and_fold_pack_unpack_same_tiles(%t: tensor<10x20x4x4xf32>) -> tensor<10x20x4x4xf32> { - %dim1 = arith.constant 40 : index - %dim2 = arith.constant 80 : index - %tensor_empty = tensor.empty(%dim1, %dim2) : tensor - %unpacked = tensor.unpack %t inner_dims_pos = [0, 1] inner_tiles = [4, 4] into %tensor_empty : tensor<10x20x4x4xf32> -> tensor - %cast = tensor.cast %unpacked : tensor to tensor<40x80xf32> - %tensor_empty1 = tensor.empty() : tensor<10x20x4x4xf32> - %packed = tensor.pack %cast inner_dims_pos = [0, 1] inner_tiles = [4, 4] into %tensor_empty1 : tensor<40x80xf32> -> tensor<10x20x4x4xf32> - return %packed : tensor<10x20x4x4xf32> -} -// CHECK-LABEL: func.func @infer_and_fold_pack_unpack_same_tiles -// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] -// CHECK: return %[[SRC]] // ----- @@ -2787,62 +2369,6 @@ func.func @fold_cast_multiple_results(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2x return %0#1 : index } -// ----- - -// CHECK-LABEL: func.func @fold_cast_pack_dynamic_tile_size -// CHECK-SAME: %[[DEST:.*]]: tensor<1x1x8x1xi32>, -// CHECK-SAME: %[[SRC:.*]]: tensor<7x?xi32>, -// CHECK-SAME: %[[PAD:.*]]: i32) -> tensor<1x1x8x1xi32> { -// CHECK: %[[PACK:.*]] = tensor.pack %[[SRC]] padding_value(%[[PAD]] : i32) -// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %[[DEST]] -// CHECK-SAME: test_attr -// CHECK-SAME: : tensor<7x?xi32> -> tensor<1x1x8x1xi32> -// CHECK: return %[[PACK]] : tensor<1x1x8x1xi32> -func.func @fold_cast_pack_dynamic_tile_size( - %dest: tensor<1x1x8x1xi32>, - %src: tensor<7x?xi32>, - %pad: i32) -> tensor<1x1x8x1xi32> { - - %cast = tensor.cast %dest : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32> - %c8 = 
arith.constant 8 : index - %pack = tensor.pack %src padding_value(%pad : i32) - inner_dims_pos = [0, 1] - inner_tiles = [%c8, 1] - into %cast {test_attr} : tensor<7x?xi32> -> tensor<1x1x?x1xi32> - %res = tensor.cast %pack : tensor<1x1x?x1xi32> to tensor<1x1x8x1xi32> - return %res : tensor<1x1x8x1xi32> -} - -// ----- - -// CHECK-LABEL: func.func @fold_cast_unpack_dynamic_tile_size( -// CHECK-SAME: %[[SRC:.*]]: tensor<1x1x8x1xi32>, -// CHECK-SAME: %[[DEST:.*]]: tensor<7x?xi32>) -> tensor<7x?xi32> { -// CHECK: %[[RES:.*]] = tensor.unpack %[[SRC]] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %[[DEST]] {test_attr} : tensor<1x1x8x1xi32> -> tensor<7x?xi32> -// CHECK: return %[[RES]] : tensor<7x?xi32> -func.func @fold_cast_unpack_dynamic_tile_size( - %src: tensor<1x1x8x1xi32>, - %res: tensor<7x?xi32>) -> tensor<7x?xi32> { - - %cast = tensor.cast %src : tensor<1x1x8x1xi32> to tensor<1x1x?x1xi32> - %c8 = arith.constant 8 : index - %unpack = tensor.unpack %cast - inner_dims_pos = [0, 1] - inner_tiles = [%c8, 1] - into %res {test_attr} : tensor<1x1x?x1xi32> -> tensor<7x?xi32> - return %unpack : tensor<7x?xi32> -} - -// ----- - -// CHECK-LABEL: func.func @pack_dont_drop_attributes( -// CHECK: tensor.pack {{.*}} {test_attr} -func.func @pack_dont_drop_attributes(%arg0: tensor, %arg1: tensor<128x?x100x16x1xf16>) -> tensor<128x?x100x16x1xf16> { - %c32_i64 = arith.constant 32 : i64 - %cst = arith.constant 0.000000e+00 : f16 - %pack = tensor.pack %arg0 padding_value(%cst : f16) outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 1] into %arg1 {test_attr} : tensor -> tensor<128x?x100x16x1xf16> - return %pack : tensor<128x?x100x16x1xf16> -} // ----- diff --git a/mlir/test/Dialect/Tensor/fold-empty-op.mlir b/mlir/test/Dialect/Tensor/fold-empty-op.mlir index 850bbcee34020..7b11c9f43c7ec 100644 --- a/mlir/test/Dialect/Tensor/fold-empty-op.mlir +++ b/mlir/test/Dialect/Tensor/fold-empty-op.mlir @@ -61,77 +61,6 @@ func.func @rank_reducing_empty_tensor_extract(%sz : index, %idx : index) -> tens return %r: tensor<2xf32> } -func.func @pack_empty(%arg0: tensor<8x8x32x32xf32>) -> tensor<8x8x32x32xf32> { - %empty_unpacked = tensor.empty() : tensor<256x256xf32> - %packed = tensor.pack %empty_unpacked - inner_dims_pos = [0, 1] inner_tiles = [32, 32] - into %arg0 : tensor<256x256xf32> -> tensor<8x8x32x32xf32> - return %packed : tensor<8x8x32x32xf32> -} - -// CHECK-LABEL: func.func @pack_empty( -// CHECK-SAME: %[[T:.+]]: tensor<8x8x32x32xf32> -// CHECK-NOT: tensor.pack -// CHECK: return %[[T]] : tensor<8x8x32x32xf32> - -func.func @pack_empty_dynamic(%arg0: tensor, %dim0: index, %dim1: index) -> tensor { - %empty_unpacked = tensor.empty(%dim0, %dim1) : tensor - %packed = tensor.pack %empty_unpacked - inner_dims_pos = [0, 1] inner_tiles = [32, 32] - into %arg0 : tensor -> tensor - return %packed : tensor -} - -// CHECK-LABEL: func.func @pack_empty_dynamic( -// CHECK-SAME: %[[T:.+]]: tensor, -// CHECK-SAME: %[[DIM0:[a-zA-Z0-9_]+]]: index, -// CHECK-SAME: %[[DIM1:[a-zA-Z0-9_]+]]: index -// CHECK-NOT: tensor.pack -// CHECK: return %[[T]] : tensor - -func.func @unpack_empty(%arg0: tensor<256x256xf32>) -> tensor<256x256xf32> { - %empty_packed = tensor.empty() : tensor<8x8x32x32xf32> - %unpacked = tensor.unpack %empty_packed - inner_dims_pos = [0, 1] inner_tiles = [32, 32] - into %arg0 : tensor<8x8x32x32xf32> -> tensor<256x256xf32> - return %unpacked : tensor<256x256xf32> -} - -// CHECK-LABEL: func.func @unpack_empty( -// CHECK-SAME: %[[T:.+]]: tensor<256x256xf32> -// CHECK-NOT: tensor.unpack -// CHECK: return 
%[[T]] : tensor<256x256xf32> - -func.func @unpack_empty_dynamic(%arg0: tensor, %dim0: index, %dim1: index) -> tensor { - %empty_packed = tensor.empty(%dim0, %dim1) : tensor - %unpacked = tensor.unpack %empty_packed - inner_dims_pos = [0, 1] inner_tiles = [32, 32] - into %arg0 : tensor -> tensor - return %unpacked : tensor -} - -// CHECK-LABEL: func.func @unpack_empty_dynamic( -// CHECK-SAME: %[[T:.+]]: tensor, -// CHECK-SAME: %[[DIM0:[a-zA-Z0-9_]+]]: index, -// CHECK-SAME: %[[DIM1:[a-zA-Z0-9_]+]]: index -// CHECK-NOT: tensor.unpack -// CHECK: return %[[T]] : tensor - -func.func @pack_padded_empty(%arg0: tensor<8x8x32x32xf32>) -> tensor<8x8x32x32xf32> { - %pad = arith.constant 1.0 : f32 - %empty_unpacked = tensor.empty() : tensor<256x256xf32> - %packed = tensor.pack %empty_unpacked - padding_value(%pad : f32) - inner_dims_pos = [0, 1] inner_tiles = [32, 32] - into %arg0 : tensor<256x256xf32> -> tensor<8x8x32x32xf32> - return %packed : tensor<8x8x32x32xf32> -} - -// CHECK-LABEL: func.func @pack_padded_empty( -// CHECK-SAME: %[[T:.+]]: tensor<8x8x32x32xf32> -// CHECK: %[[PACK:.+]] = tensor.pack -// CHECK: return %[[PACK]] : tensor<8x8x32x32xf32> - // ----- module attributes {transform.with_named_sequence} { diff --git a/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir b/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir index bff913f5f55fe..84eb60248b8be 100644 --- a/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir +++ b/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir @@ -1,8 +1,8 @@ -// RUN: mlir-opt -split-input-file -test-tensor-transform-patterns=test-fold-into-pack-and-unpack %s | FileCheck %s +// RUN: mlir-opt -split-input-file -test-linalg-transform-patterns=test-fold-into-pack-and-unpack %s | FileCheck %s func.func @fold_unpack_slice(%arg0 : tensor, %arg1 : tensor, %arg2 : index, %arg3 : index) -> tensor { - %0 = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %arg1 + %0 = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %arg1 : tensor -> tensor %1 = tensor.extract_slice %0[0, 0] [%arg2, %arg3] [1, 1] : tensor to tensor return %1 : tensor @@ -13,7 +13,7 @@ func.func @fold_unpack_slice(%arg0 : tensor, %arg1 : tensor -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] inner_dims_pos = [0, 1] inner_tiles = [8, 4] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] inner_dims_pos = [0, 1] inner_tiles = [8, 4] // CHECK-SAME: into %[[INIT]] // CHECK: return %[[UNPACK]] @@ -21,39 +21,39 @@ func.func @fold_unpack_slice(%arg0 : tensor, %arg1 : tensor, %arg1 : tensor, %arg2 : index, %arg3 : index, %arg4 : index) -> tensor { - %0 = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %arg1 + %0 = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %arg1 : tensor -> tensor %1 = tensor.extract_slice %0[0, %arg4] [%arg2, %arg3] [1, 1] : tensor to tensor return %1 : tensor } // CHECK-LABEL: func @nofold_unpack_slice_non_zero_offset( -// CHECK: %[[UNPACK:.+]] = tensor.unpack +// CHECK: %[[UNPACK:.+]] = linalg.unpack // CHECK: tensor.extract_slice %[[UNPACK]] // ----- func.func @nofold_unpack_slice_non_unit_stride(%arg0 : tensor, %arg1 : tensor, %arg2 : index, %arg3 : index, %arg4 : index) -> tensor { - %0 = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %arg1 + %0 = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %arg1 : tensor -> tensor %1 = tensor.extract_slice %0[0, 0] [%arg2, %arg3] [%arg4, 1] : tensor to tensor return %1 : tensor } // CHECK-LABEL: 
func @nofold_unpack_slice_non_unit_stride( -// CHECK: %[[UNPACK:.+]] = tensor.unpack +// CHECK: %[[UNPACK:.+]] = linalg.unpack // CHECK: tensor.extract_slice %[[UNPACK]] // ----- func.func @nofold_unpack_slice_rank_reduced(%arg0 : tensor, %arg1 : tensor, %arg2 : index, %arg3 : index) -> tensor { - %0 = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %arg1 + %0 = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %arg1 : tensor -> tensor %1 = tensor.extract_slice %0[0, 0] [1, 1] [1, 1] : tensor to tensor return %1 : tensor } // CHECK-LABEL: func @nofold_unpack_slice_rank_reduced( -// CHECK: %[[UNPACK:.+]] = tensor.unpack +// CHECK: %[[UNPACK:.+]] = linalg.unpack // CHECK: tensor.extract_slice %[[UNPACK]] // ----- @@ -66,7 +66,7 @@ func.func @pad_pack(%src: tensor<16641x16xf32>) -> tensor<2082x1x8x32xf32> { tensor.yield %cst : f32 } : tensor<16641x16xf32> to tensor<16656x16xf32> %empty = tensor.empty() : tensor<2082x1x8x32xf32> - %pack = tensor.pack %padded padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %empty + %pack = linalg.pack %padded padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %empty : tensor<16656x16xf32> -> tensor<2082x1x8x32xf32> return %pack : tensor<2082x1x8x32xf32> } @@ -74,7 +74,7 @@ func.func @pad_pack(%src: tensor<16641x16xf32>) -> tensor<2082x1x8x32xf32> { // CHECK-SAME: %[[SRC:[a-zA-Z0-9]+]] // CHECK: %[[PAD_VAL:.+]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[DEST:.+]] = tensor.empty() : tensor<2082x1x8x32xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[SRC]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[SRC]] // CHECK-SAME: padding_value(%[[PAD_VAL]] : f32) // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %[[DEST]] @@ -88,13 +88,13 @@ func.func @nofold_pad_pack(%src: tensor<16641x16xf32>) -> tensor<2082x1x8x32xf32 tensor.yield %cst : f32 } : tensor<16641x16xf32> to tensor<16656x16xf32> %empty = tensor.empty() : tensor<2082x1x8x32xf32> - %pack = tensor.pack %padded padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %empty + %pack = linalg.pack %padded padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %empty : tensor<16656x16xf32> -> tensor<2082x1x8x32xf32> return %pack : tensor<2082x1x8x32xf32> } // CHECK-LABEL: func.func @nofold_pad_pack // CHECK: tensor.pad -// CHECK: tensor.pack +// CHECK: linalg.pack // ----- @@ -107,19 +107,19 @@ func.func @pad_pack_different_padding_value(%src: tensor<16641x16xf32>) -> tenso tensor.yield %cst0 : f32 } : tensor<16641x16xf32> to tensor<16656x16xf32> %empty = tensor.empty() : tensor<2082x1x8x32xf32> - %pack = tensor.pack %padded padding_value(%cst1 : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %empty + %pack = linalg.pack %padded padding_value(%cst1 : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %empty : tensor<16656x16xf32> -> tensor<2082x1x8x32xf32> return %pack : tensor<2082x1x8x32xf32> } // CHECK-LABEL: func.func @pad_pack_different_padding_value // CHECK: tensor.pad -// CHECK: tensor.pack +// CHECK: linalg.pack // ----- -func.func @tensor_pack_linalg_transpose_fold(%arg0: tensor<56x57x1x64xf32>) -> tensor<1x57x56x2x32xf32> { +func.func @linalg.pack_linalg_transpose_fold(%arg0: tensor<56x57x1x64xf32>) -> tensor<1x57x56x2x32xf32> { %0 = tensor.empty() : tensor<56x2x1x57x32xf32> - %pack = tensor.pack %arg0 + %pack = linalg.pack %arg0 outer_dims_perm = [0, 3, 2, 1] inner_dims_pos = [3] inner_tiles = [32] @@ -132,10 +132,10 @@ func.func 
@tensor_pack_linalg_transpose_fold(%arg0: tensor<56x57x1x64xf32>) -> t permutation = [2, 3, 0, 1, 4] return %transposed : tensor<1x57x56x2x32xf32> } -// CHECK: func @tensor_pack_linalg_transpose_fold( +// CHECK: func @linalg.pack_linalg_transpose_fold( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x57x1x64xf32>) // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x57x56x2x32xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [2, 1, 0, 3] // CHECK-SAME: inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[INIT]] @@ -143,9 +143,9 @@ func.func @tensor_pack_linalg_transpose_fold(%arg0: tensor<56x57x1x64xf32>) -> t // ----- -func.func @tensor_pack_linalg_transpose_fold_with_padding(%arg0: tensor<56x57x1x55xf32>, %padding: f32) -> tensor<1x57x56x2x32xf32> { +func.func @linalg.pack_linalg_transpose_fold_with_padding(%arg0: tensor<56x57x1x55xf32>, %padding: f32) -> tensor<1x57x56x2x32xf32> { %0 = tensor.empty() : tensor<56x2x1x57x32xf32> - %pack = tensor.pack %arg0 padding_value(%padding : f32) + %pack = linalg.pack %arg0 padding_value(%padding : f32) outer_dims_perm = [0, 3, 2, 1] inner_dims_pos = [3] inner_tiles = [32] @@ -158,10 +158,10 @@ func.func @tensor_pack_linalg_transpose_fold_with_padding(%arg0: tensor<56x57x1x permutation = [2, 3, 0, 1, 4] return %transposed : tensor<1x57x56x2x32xf32> } -// CHECK: func @tensor_pack_linalg_transpose_fold_with_padding( +// CHECK: func @linalg.pack_linalg_transpose_fold_with_padding( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x57x1x55xf32>, %[[PADDING:.+]]: f32) // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x57x56x2x32xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] padding_value(%[[PADDING]] : f32) +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] padding_value(%[[PADDING]] : f32) // CHECK-SAME: outer_dims_perm = [2, 1, 0, 3] // CHECK-SAME: inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[INIT]] @@ -169,9 +169,9 @@ func.func @tensor_pack_linalg_transpose_fold_with_padding(%arg0: tensor<56x57x1x // ----- -func.func @tensor_pack_linalg_transpose_fold_no_outer_dims_perm(%arg0: tensor<56x57x1x64xf32>) -> tensor<1x2x56x57x32xf32> { +func.func @linalg.pack_linalg_transpose_fold_no_outer_dims_perm(%arg0: tensor<56x57x1x64xf32>) -> tensor<1x2x56x57x32xf32> { %0 = tensor.empty() : tensor<56x57x1x2x32xf32> - %pack = tensor.pack %arg0 + %pack = linalg.pack %arg0 inner_dims_pos = [3] inner_tiles = [32] into %0 : tensor<56x57x1x64xf32> -> tensor<56x57x1x2x32xf32> @@ -183,10 +183,10 @@ func.func @tensor_pack_linalg_transpose_fold_no_outer_dims_perm(%arg0: tensor<56 permutation = [2, 3, 0, 1, 4] return %transposed : tensor<1x2x56x57x32xf32> } -// CHECK: func @tensor_pack_linalg_transpose_fold_no_outer_dims_perm( +// CHECK: func @linalg.pack_linalg_transpose_fold_no_outer_dims_perm( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x57x1x64xf32>) // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x2x56x57x32xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [2, 3, 0, 1] // CHECK-SAME: inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[INIT]] @@ -194,9 +194,9 @@ func.func @tensor_pack_linalg_transpose_fold_no_outer_dims_perm(%arg0: tensor<56 // ----- -func.func @tensor_pack_linalg_transpose_fold_tile_dims_transpose(%arg0: tensor<56x72x24x128xf32>) -> tensor<12x56x4x9x32x8x2xf32> { +func.func @linalg.pack_linalg_transpose_fold_tile_dims_transpose(%arg0: tensor<56x72x24x128xf32>) -> 
tensor<12x56x4x9x32x8x2xf32> { %0 = tensor.empty() : tensor<4x9x12x56x8x2x32xf32> - %pack = tensor.pack %arg0 + %pack = linalg.pack %arg0 outer_dims_perm = [3, 1, 2, 0] inner_dims_pos = [1, 2, 3] inner_tiles = [8, 2, 32] @@ -209,10 +209,10 @@ func.func @tensor_pack_linalg_transpose_fold_tile_dims_transpose(%arg0: tensor<5 permutation = [2, 3, 0, 1, 6, 4, 5] return %transposed : tensor<12x56x4x9x32x8x2xf32> } -// CHECK: func @tensor_pack_linalg_transpose_fold_tile_dims_transpose( +// CHECK: func @linalg.pack_linalg_transpose_fold_tile_dims_transpose( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x72x24x128xf32>) // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<12x56x4x9x32x8x2xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [2, 0, 3, 1] // CHECK-SAME: inner_dims_pos = [3, 1, 2] inner_tiles = [32, 8, 2] // CHECK-SAME: into %[[INIT]] @@ -220,9 +220,9 @@ func.func @tensor_pack_linalg_transpose_fold_tile_dims_transpose(%arg0: tensor<5 // ----- -func.func @tensor_pack_linalg_transpose_fold_tile_dims_outer_dims_transpose(%arg0: tensor<56x72x24x128xf32>) -> tensor<9x56x2x12x32x8x4xf32> { +func.func @linalg.pack_linalg_transpose_fold_tile_dims_outer_dims_transpose(%arg0: tensor<56x72x24x128xf32>) -> tensor<9x56x2x12x32x8x4xf32> { %0 = tensor.empty() : tensor<4x12x9x56x8x2x32xf32> - %pack = tensor.pack %arg0 + %pack = linalg.pack %arg0 outer_dims_perm = [3, 2, 1, 0] inner_dims_pos = [1, 2, 3] inner_tiles = [8, 2, 32] @@ -235,16 +235,16 @@ func.func @tensor_pack_linalg_transpose_fold_tile_dims_outer_dims_transpose(%arg permutation = [2, 3, 5, 1, 6, 4, 0] return %transposed : tensor<9x56x2x12x32x8x4xf32> } -// CHECK: func @tensor_pack_linalg_transpose_fold_tile_dims_outer_dims_transpose( +// CHECK: func @linalg.pack_linalg_transpose_fold_tile_dims_outer_dims_transpose( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x72x24x128xf32>) -// CHECK: tensor.pack +// CHECK: linalg.pack // CHECK: linalg.transpose // ----- -func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims(%arg0: tensor<56x?x?x64xf32>) -> tensor { +func.func @linalg.pack_linalg_transpose_fold_dynamic_outer_dims(%arg0: tensor<56x?x?x64xf32>) -> tensor { %0 = tensor.empty() : tensor<56x2x1x57x32xf32> - %pack = tensor.pack %arg0 + %pack = linalg.pack %arg0 outer_dims_perm = [0, 3, 2, 1] inner_dims_pos = [3] inner_tiles = [32] @@ -259,14 +259,14 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims(%arg0: tensor<56 %return_value = tensor.cast %transposed : tensor<1x57x56x2x32xf32> to tensor return %return_value : tensor } -// CHECK: func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims( +// CHECK: func @linalg.pack_linalg_transpose_fold_dynamic_outer_dims( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x?x?x64xf32>) // CHECK-DAG: %[[c1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[c2:.+]] = arith.constant 2 : index // CHECK: %[[dim:.+]] = tensor.dim %[[ARG0]], %[[c1]] : tensor<56x?x?x64xf32> // CHECK: %[[dim_0:.+]] = tensor.dim %[[ARG0]], %[[c2]] : tensor<56x?x?x64xf32> // CHECK: %[[INIT:.+]] = tensor.empty(%[[dim_0]], %[[dim]]) : tensor -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [2, 1, 0, 3] // CHECK-SAME: inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[INIT]] @@ -274,9 +274,9 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims(%arg0: tensor<56 // ----- -func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_and_tile_dims(%arg0: 
tensor<56x?x?x128xf32>) -> tensor { +func.func @linalg.pack_linalg_transpose_fold_dynamic_outer_and_tile_dims(%arg0: tensor<56x?x?x128xf32>) -> tensor { %0 = tensor.empty() : tensor<56x9x12x4x8x2x32xf32> - %pack = tensor.pack %arg0 + %pack = linalg.pack %arg0 inner_dims_pos = [1, 2, 3] inner_tiles = [8, 2, 32] into %0 : tensor<56x?x?x128xf32> -> tensor<56x9x12x4x8x2x32xf32> @@ -292,7 +292,7 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_and_tile_dims(%arg0: } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> // CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 ceildiv 2)> -// CHECK-LABEL: func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_and_tile_dims( +// CHECK-LABEL: func.func @linalg.pack_linalg_transpose_fold_dynamic_outer_and_tile_dims( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x?x?x128xf32>) // CHECK-DAG: %[[c1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[c2:.+]] = arith.constant 2 : index @@ -301,15 +301,15 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_and_tile_dims(%arg0: // CHECK: %[[mapped_dim1:.+]] = affine.apply #[[$MAP0]]()[%[[dim]]] // CHECK: %[[mapped_dim2:.+]] = affine.apply #[[$MAP1]]()[%[[dim_0]]] // CHECK: %[[INIT:.+]] = tensor.empty(%[[mapped_dim2]], %[[mapped_dim1]]) : tensor -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] outer_dims_perm = [2, 3, 0, 1] inner_dims_pos = [3, 1, 2] inner_tiles = [32, 8, 2] into %[[INIT]] : tensor<56x?x?x128xf32> -> tensor +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] outer_dims_perm = [2, 3, 0, 1] inner_dims_pos = [3, 1, 2] inner_tiles = [32, 8, 2] into %[[INIT]] : tensor<56x?x?x128xf32> -> tensor // CHECK: %[[CAST:.+]] = tensor.cast %[[PACK]] : tensor to tensor // CHECK: return %[[CAST]] : tensor // CHECK: } // ----- -func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims_tile_dims_tile_sizes(%arg0: tensor, %pack_dest: tensor, %transpose_dest: tensor, %tile_p : index, %tile_q : index, %tile_r : index) -> tensor { - %pack = tensor.pack %arg0 +func.func @linalg.pack_linalg_transpose_fold_dynamic_outer_dims_tile_dims_tile_sizes(%arg0: tensor, %pack_dest: tensor, %transpose_dest: tensor, %tile_p : index, %tile_q : index, %tile_r : index) -> tensor { + %pack = linalg.pack %arg0 outer_dims_perm = [3, 0, 2, 1] inner_dims_pos = [1, 2, 3] inner_tiles = [%tile_p, %tile_q, %tile_r] @@ -324,7 +324,7 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims_tile_dims_tile_s } // CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)> // CHECK: module { -// CHECK: func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims_tile_dims_tile_sizes( +// CHECK: func.func @linalg.pack_linalg_transpose_fold_dynamic_outer_dims_tile_dims_tile_sizes( // CHECK-SAME: %[[ARG0:.+]]: tensor, // CHECK-SAME: %[[PACK_DEST:.+]]: tensor, %[[TRANSPOSE_DEST:.+]]: tensor, // CHECK-SAME: %[[ARG1:.+]]: index, %[[ARG2:.+]]: index, @@ -341,13 +341,13 @@ func.func @tensor_pack_linalg_transpose_fold_dynamic_outer_dims_tile_dims_tile_s // CHECK: %[[mapped_dim1:.+]] = affine.apply #[[$MAP]]()[%[[dim_0]], %[[ARG1]]] // CHECK: %[[mapped_dim2:.+]] = affine.apply #[[$MAP]]()[%[[dim_1]], %[[ARG2]]] // CHECK: %[[INIT:.+]] = tensor.empty(%[[mapped_dim2]], %[[mapped_dim1]], %[[mapped_dim0]], %[[dim]], %[[ARG3]], %[[ARG1]], %[[ARG2]]) : tensor -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] outer_dims_perm = [2, 1, 3, 0] inner_dims_pos = [3, 1, 2] inner_tiles = [%[[ARG3]], %[[ARG1]], %[[ARG2]]] into %[[INIT]] : tensor -> tensor +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] outer_dims_perm = [2, 1, 
3, 0] inner_dims_pos = [3, 1, 2] inner_tiles = [%[[ARG3]], %[[ARG1]], %[[ARG2]]] into %[[INIT]] : tensor -> tensor // CHECK: return %[[PACK]] : tensor // CHECK: } // ----- -func.func @linalg_transpose_tensor_pack_fold(%arg0: tensor<56x57x1x64xf32>) -> tensor<1x57x56x2x32xf32> { +func.func @linalg_transpose_linalg.pack_fold(%arg0: tensor<56x57x1x64xf32>) -> tensor<1x57x56x2x32xf32> { %0 = tensor.empty() : tensor<1x56x57x64xf32> %transposed = linalg.transpose ins(%arg0 : tensor<56x57x1x64xf32>) @@ -355,17 +355,17 @@ func.func @linalg_transpose_tensor_pack_fold(%arg0: tensor<56x57x1x64xf32>) -> t permutation = [2, 0, 1, 3] %1 = tensor.empty() : tensor<1x57x56x2x32xf32> - %pack = tensor.pack %transposed + %pack = linalg.pack %transposed outer_dims_perm = [0, 2, 1, 3] inner_dims_pos = [3] inner_tiles = [32] into %1 : tensor<1x56x57x64xf32> -> tensor<1x57x56x2x32xf32> return %pack : tensor<1x57x56x2x32xf32> } -//CHECK-LABEL: func @linalg_transpose_tensor_pack_fold( +//CHECK-LABEL: func @linalg_transpose_linalg.pack_fold( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x57x1x64xf32>) // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x57x56x2x32xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [2, 1, 0, 3] // CHECK-SAME: inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[INIT]] @@ -373,7 +373,7 @@ func.func @linalg_transpose_tensor_pack_fold(%arg0: tensor<56x57x1x64xf32>) -> t // ----- -func.func @linalg_transpose_tensor_pack_fold_with_padding(%arg0: tensor<56x57x1x55xf32>, %padding: f32) -> tensor<1x57x56x2x32xf32> { +func.func @linalg_transpose_linalg.pack_fold_with_padding(%arg0: tensor<56x57x1x55xf32>, %padding: f32) -> tensor<1x57x56x2x32xf32> { %0 = tensor.empty() : tensor<1x56x57x55xf32> %transpose = linalg.transpose ins(%arg0 : tensor<56x57x1x55xf32>) @@ -381,17 +381,17 @@ func.func @linalg_transpose_tensor_pack_fold_with_padding(%arg0: tensor<56x57x1x permutation = [2, 0, 1, 3] %1 = tensor.empty() : tensor<1x57x56x2x32xf32> - %pack = tensor.pack %transpose padding_value(%padding : f32) + %pack = linalg.pack %transpose padding_value(%padding : f32) outer_dims_perm = [0, 2, 1, 3] inner_dims_pos = [3] inner_tiles = [32] into %1 : tensor<1x56x57x55xf32> -> tensor<1x57x56x2x32xf32> return %pack : tensor<1x57x56x2x32xf32> } -//CHECK-LABEL: func @linalg_transpose_tensor_pack_fold_with_padding( +//CHECK-LABEL: func @linalg_transpose_linalg.pack_fold_with_padding( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x57x1x55xf32>, %[[PADDING:.+]]: f32) // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x57x56x2x32xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] padding_value(%[[PADDING]] : f32) +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] padding_value(%[[PADDING]] : f32) // CHECK-SAME: outer_dims_perm = [2, 1, 0, 3] // CHECK-SAME: inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[INIT]] @@ -399,7 +399,7 @@ func.func @linalg_transpose_tensor_pack_fold_with_padding(%arg0: tensor<56x57x1x // ----- -func.func @linalg_transpose_tensor_pack_fold_no_outer_dims_perm(%arg0: tensor<56x57x1x64xf32>) -> tensor<1x56x57x2x32xf32> { +func.func @linalg_transpose_linalg.pack_fold_no_outer_dims_perm(%arg0: tensor<56x57x1x64xf32>) -> tensor<1x56x57x2x32xf32> { %0 = tensor.empty() : tensor<1x56x57x64xf32> %transposed = linalg.transpose ins(%arg0 : tensor<56x57x1x64xf32>) @@ -407,16 +407,16 @@ func.func @linalg_transpose_tensor_pack_fold_no_outer_dims_perm(%arg0: tensor<56 permutation = [2, 0, 1, 3] %1 = tensor.empty() : 
tensor<1x56x57x2x32xf32> - %pack = tensor.pack %transposed + %pack = linalg.pack %transposed inner_dims_pos = [3] inner_tiles = [32] into %1 : tensor<1x56x57x64xf32> -> tensor<1x56x57x2x32xf32> return %pack : tensor<1x56x57x2x32xf32> } -//CHECK-LABEL: func @linalg_transpose_tensor_pack_fold_no_outer_dims_perm( +//CHECK-LABEL: func @linalg_transpose_linalg.pack_fold_no_outer_dims_perm( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x57x1x64xf32>) // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x56x57x2x32xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [2, 0, 1, 3] // CHECK-SAME: inner_dims_pos = [3] inner_tiles = [32] // CHECK-SAME: into %[[INIT]] @@ -424,25 +424,25 @@ func.func @linalg_transpose_tensor_pack_fold_no_outer_dims_perm(%arg0: tensor<56 // ----- -func.func @linalg_transpose_tensor_pack_fold_complex_inner_dims_change(%arg0: tensor<25x30x35x40xf32>, %transpose_dest: tensor<35x40x25x30xf32>, %pack_dest: tensor<3x35x5x8x5x10x5xf32>) -> tensor<3x35x5x8x5x10x5xf32> { +func.func @linalg_transpose_linalg.pack_fold_complex_inner_dims_change(%arg0: tensor<25x30x35x40xf32>, %transpose_dest: tensor<35x40x25x30xf32>, %pack_dest: tensor<3x35x5x8x5x10x5xf32>) -> tensor<3x35x5x8x5x10x5xf32> { %transposed = linalg.transpose ins(%arg0 : tensor<25x30x35x40xf32>) outs(%transpose_dest : tensor<35x40x25x30xf32>) permutation = [2, 3, 0, 1] - %pack = tensor.pack %transposed + %pack = linalg.pack %transposed outer_dims_perm = [3, 0, 2, 1] inner_dims_pos = [1, 3, 2] inner_tiles = [5, 10, 5] into %pack_dest : tensor<35x40x25x30xf32> -> tensor<3x35x5x8x5x10x5xf32> return %pack : tensor<3x35x5x8x5x10x5xf32> } -//CHECK-LABEL: func.func @linalg_transpose_tensor_pack_fold_complex_inner_dims_change( +//CHECK-LABEL: func.func @linalg_transpose_linalg.pack_fold_complex_inner_dims_change( // CHECK-SAME: %[[ARG0:.+]]: tensor<25x30x35x40xf32>, // CHECK-SAME: %[[ARG1:.+]]: tensor<35x40x25x30xf32>, // CHECK-SAME: %[[ARG2:.+]]: tensor<3x35x5x8x5x10x5xf32>) -> tensor<3x35x5x8x5x10x5xf32> { // CHECK: %[[VAL0:.+]] = tensor.empty() : tensor<3x35x5x8x5x10x5xf32> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [1, 2, 0, 3] // CHECK-SAME: inner_dims_pos = [3, 1, 0] // CHECK-SAME: inner_tiles = [5, 10, 5] @@ -451,13 +451,13 @@ func.func @linalg_transpose_tensor_pack_fold_complex_inner_dims_change(%arg0: te // ----- -func.func @linalg_transpose_tensor_pack_fold_dynamic_outer_dims_tile_dims_tile_sizes(%arg0: tensor, %transpose_dest: tensor, %pack_dest: tensor, %tile_p : index, %tile_q : index, %tile_r : index) -> tensor { +func.func @linalg_transpose_linalg.pack_fold_dynamic_outer_dims_tile_dims_tile_sizes(%arg0: tensor, %transpose_dest: tensor, %pack_dest: tensor, %tile_p : index, %tile_q : index, %tile_r : index) -> tensor { %transposed = linalg.transpose ins(%arg0 : tensor) outs(%transpose_dest : tensor) permutation = [2, 3, 0, 1] - %pack = tensor.pack %transposed + %pack = linalg.pack %transposed outer_dims_perm = [3, 0, 2, 1] inner_dims_pos = [1, 3, 2] inner_tiles = [%tile_p, %tile_q, %tile_r] @@ -465,7 +465,7 @@ func.func @linalg_transpose_tensor_pack_fold_dynamic_outer_dims_tile_dims_tile_s return %pack : tensor } // CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)> -//CHECK-LABEL: func.func @linalg_transpose_tensor_pack_fold_dynamic_outer_dims_tile_dims_tile_sizes( +//CHECK-LABEL: func.func 
@linalg_transpose_linalg.pack_fold_dynamic_outer_dims_tile_dims_tile_sizes( // CHECK-SAME: %[[ARG0:.+]]: tensor, %[[ARG1:.+]]: tensor, // CHECK-SAME: %[[ARG2:.+]]: tensor, %[[ARG3:.+]]: index, %[[ARG4:.+]]: index, %[[ARG5:.+]]: index) -> tensor { // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index @@ -480,12 +480,12 @@ func.func @linalg_transpose_tensor_pack_fold_dynamic_outer_dims_tile_dims_tile_s // CHECK: %[[VAL1:.+]] = affine.apply #[[$MAP]]()[%[[DIM0]], %[[ARG4]]] // CHECK: %[[VAL2:.+]] = affine.apply #[[$MAP]]()[%[[DIM]], %[[ARG5]]] // CHECK: %[[VAL3:.+]] = tensor.empty(%[[VAL1]], %[[DIM1]], %[[VAL2]], %[[VAL0]], %[[ARG3]], %[[ARG4]], %[[ARG5]]) : tensor -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] outer_dims_perm = [1, 2, 0, 3] inner_dims_pos = [3, 1, 0] inner_tiles = [%[[ARG3]], %[[ARG4]], %[[ARG5]]] into %[[VAL3]] : tensor -> tensor +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] outer_dims_perm = [1, 2, 0, 3] inner_dims_pos = [3, 1, 0] inner_tiles = [%[[ARG3]], %[[ARG4]], %[[ARG5]]] into %[[VAL3]] : tensor -> tensor // CHECK: return %[[PACK]] : tensor // ----- -func.func @linalg_transpose_tensor_pack_multiple_tiles(%arg0: tensor) -> tensor<32x?x64x16x2xbf16> { +func.func @linalg_transpose_linalg.pack_multiple_tiles(%arg0: tensor) -> tensor<32x?x64x16x2xbf16> { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : bf16 %dim = tensor.dim %arg0, %c0 : tensor @@ -497,7 +497,7 @@ func.func @linalg_transpose_tensor_pack_multiple_tiles(%arg0: tensor - %pack = tensor.pack %transposed + %pack = linalg.pack %transposed padding_value(%cst : bf16) outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] @@ -506,14 +506,14 @@ func.func @linalg_transpose_tensor_pack_multiple_tiles(%arg0: tensor } // CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> -//CHECK-LABEL: func.func @linalg_transpose_tensor_pack_multiple_tiles( +//CHECK-LABEL: func.func @linalg_transpose_linalg.pack_multiple_tiles( // CHECK-SAME: %[[ARG0:.+]]: tensor) -> tensor<32x?x64x16x2xbf16> { // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[CST:.+]] = arith.constant 0.000000e+00 : bf16 // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[VAL0:.+]] = affine.apply #[[$MAP]]()[%[[DIM]]] // CHECK: %[[VAL1:.+]] = tensor.empty(%[[VAL0]]) : tensor<32x?x64x16x2xbf16> -// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG0]] // CHECK-SAME: padding_value(%[[CST]] : bf16) // CHECK-SAME: outer_dims_perm = [1, 0, 2] // CHECK-SAME: inner_dims_pos = [0, 2] @@ -524,23 +524,23 @@ func.func @linalg_transpose_tensor_pack_multiple_tiles(%arg0: tensor) -> tensor<16x4xi32> { +func.func @linalg_transpose_linalg.unpack_fold(%arg0: tensor<1x1x4x16xi32>) -> tensor<16x4xi32> { %0 = tensor.empty() : tensor<1x1x16x4xi32> %transposed = linalg.transpose ins(%arg0 : tensor<1x1x4x16xi32>) outs(%0 : tensor<1x1x16x4xi32>) permutation = [1, 0, 3, 2] %1 = tensor.empty() : tensor<16x4xi32> - %unpack = tensor.unpack %transposed + %unpack = linalg.unpack %transposed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 4] into %1 : tensor<1x1x16x4xi32> -> tensor<16x4xi32> return %unpack : tensor<16x4xi32> } -//CHECK-LABEL: func.func @linalg_transpose_tensor_unpack_fold( +//CHECK-LABEL: func.func @linalg_transpose_linalg.unpack_fold( // CHECK-SAME: %[[ARG0:.+]]: tensor<1x1x4x16xi32>) -> tensor<16x4xi32> { // CHECK: %[[OUT:.+]] = tensor.empty() : tensor<16x4xi32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack 
%[[ARG0]] // CHECK-SAME: outer_dims_perm = [1, 0] // CHECK-SAME: inner_dims_pos = [1, 0] // CHECK-SAME: inner_tiles = [4, 16] @@ -550,23 +550,23 @@ func.func @linalg_transpose_tensor_unpack_fold(%arg0: tensor<1x1x4x16xi32>) -> t // ----- -func.func @linalg_transpose_tensor_unpack_fold_partial_tile(%arg0: tensor<1x1x4x16xi32>) -> tensor<15x3xi32> { +func.func @linalg_transpose_linalg.unpack_fold_partial_tile(%arg0: tensor<1x1x4x16xi32>) -> tensor<15x3xi32> { %0 = tensor.empty() : tensor<1x1x16x4xi32> %transposed = linalg.transpose ins(%arg0 : tensor<1x1x4x16xi32>) outs(%0 : tensor<1x1x16x4xi32>) permutation = [1, 0, 3, 2] %1 = tensor.empty() : tensor<15x3xi32> - %unpack = tensor.unpack %transposed + %unpack = linalg.unpack %transposed outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 4] into %1 : tensor<1x1x16x4xi32> -> tensor<15x3xi32> return %unpack : tensor<15x3xi32> } -//CHECK-LABEL: func.func @linalg_transpose_tensor_unpack_fold_partial_tile( +//CHECK-LABEL: func.func @linalg_transpose_linalg.unpack_fold_partial_tile( // CHECK-SAME: %[[ARG0:.+]]: tensor<1x1x4x16xi32>) -> tensor<15x3xi32> { // CHECK: %[[OUT:.+]] = tensor.empty() : tensor<15x3xi32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [1, 0] // CHECK-SAME: inner_dims_pos = [1, 0] // CHECK-SAME: inner_tiles = [4, 16] @@ -576,20 +576,20 @@ func.func @linalg_transpose_tensor_unpack_fold_partial_tile(%arg0: tensor<1x1x4x // ----- -func.func @linalg_transpose_tensor_unpack_fold_dynamic_outer_dims_tile_dims_tile_sizes(%arg0: tensor, %transpose_dest: tensor, %unpack_dest: tensor, %tile_p : index, %tile_q : index) -> tensor { +func.func @linalg_transpose_linalg.unpack_fold_dynamic_outer_dims_tile_dims_tile_sizes(%arg0: tensor, %transpose_dest: tensor, %unpack_dest: tensor, %tile_p : index, %tile_q : index) -> tensor { %transposed = linalg.transpose ins(%arg0 : tensor) outs(%transpose_dest : tensor) permutation = [1, 0, 3, 2] - %unpack = tensor.unpack %transposed + %unpack = linalg.unpack %transposed outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [%tile_p, %tile_q] into %unpack_dest : tensor -> tensor return %unpack : tensor } -// CHECK-LABEL: func.func @linalg_transpose_tensor_unpack_fold_dynamic_outer_dims_tile_dims_tile_sizes( +// CHECK-LABEL: func.func @linalg_transpose_linalg.unpack_fold_dynamic_outer_dims_tile_dims_tile_sizes( // CHECK-SAME: %[[ARG0:.+]]: tensor, %[[ARG1:.+]]: tensor, %[[ARG2:.+]]: tensor, // CHECK-SAME: %[[IDX1:.+]]: index, %[[IDX2:.+]]: index) -> tensor { // CHECK-DAG: %[[CST1:.+]] = arith.constant 1 : index @@ -597,7 +597,7 @@ func.func @linalg_transpose_tensor_unpack_fold_dynamic_outer_dims_tile_dims_tile // CHECK-DAG: %[[DIM0:.+]] = tensor.dim %[[ARG2]], %[[CST0]] : tensor // CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[ARG2]], %[[CST1]] : tensor // CHECK: %[[OUT:.+]] = tensor.empty(%[[DIM0]], %[[DIM1]]) : tensor -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 1] // CHECK-SAME: inner_dims_pos = [1, 0] // CHECK-SAME: inner_tiles = [%[[IDX2]], %[[IDX1]]] @@ -607,9 +607,9 @@ func.func @linalg_transpose_tensor_unpack_fold_dynamic_outer_dims_tile_dims_tile // ----- -func.func @tensor_unpack_linalg_transpose_fold(%arg0: tensor<56x57x1x64xf32>) -> tensor<3648x56xf32> { +func.func @linalg.unpack_linalg_transpose_fold(%arg0: tensor<56x57x1x64xf32>) -> tensor<3648x56xf32> { %0 = tensor.empty() : 
tensor<56x3648xf32> - %pack = tensor.unpack %arg0 + %pack = linalg.unpack %arg0 outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [1, 64] @@ -622,10 +622,10 @@ func.func @tensor_unpack_linalg_transpose_fold(%arg0: tensor<56x57x1x64xf32>) -> permutation = [1,0] return %transposed : tensor<3648x56xf32> } -// CHECK-LABEL: func.func @tensor_unpack_linalg_transpose_fold( +// CHECK-LABEL: func.func @linalg.unpack_linalg_transpose_fold( // CHECK-SAME: %[[ARG0:.+]]: tensor<56x57x1x64xf32>) -> tensor<3648x56xf32> { // CHECK: %[[OUT:.+]] = tensor.empty() : tensor<3648x56xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [1, 0] // CHECK-SAME: inner_dims_pos = [1, 0] // CHECK-SAME: inner_tiles = [1, 64] @@ -637,7 +637,7 @@ func.func @tensor_unpack_linalg_transpose_fold(%arg0: tensor<56x57x1x64xf32>) -> func.func @tensor_padded_unpack_linalg_transpose_fold(%arg0: tensor<71x7x4x16x16xf32>) -> tensor<100x71x64xf32> { %0 = tensor.empty() : tensor<71x100x64xf32> - %pack = tensor.unpack %arg0 + %pack = linalg.unpack %arg0 inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %0 : tensor<71x7x4x16x16xf32> -> tensor<71x100x64xf32> @@ -652,7 +652,7 @@ func.func @tensor_padded_unpack_linalg_transpose_fold(%arg0: tensor<71x7x4x16x16 // CHECK-LABEL: func.func @tensor_padded_unpack_linalg_transpose_fold( // CHECK-SAME: %[[ARG0:.+]]: tensor<71x7x4x16x16xf32>) -> tensor<100x71x64xf32> { // CHECK: %[[OUT:.+]] = tensor.empty() : tensor<100x71x64xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [1, 0, 2] // CHECK-SAME: inner_dims_pos = [0, 2] // CHECK-SAME: inner_tiles = [16, 16] @@ -668,7 +668,7 @@ func.func @non_involution_transpose_unpack_fold(%arg0: tensor<2x3x5x4x16xi32>) - outs(%0 : tensor<5x2x3x16x4xi32>) permutation = [2, 0, 1, 4, 3] %1 = tensor.empty() : tensor<5x48x8xi32> - %unpack = tensor.unpack %transposed + %unpack = linalg.unpack %transposed outer_dims_perm = [0, 2, 1] inner_dims_pos = [1, 2] inner_tiles = [16, 4] into @@ -678,7 +678,7 @@ func.func @non_involution_transpose_unpack_fold(%arg0: tensor<2x3x5x4x16xi32>) - //CHECK-LABEL: func.func @non_involution_transpose_unpack_fold( // CHECK-SAME: %[[ARG0:.+]]: tensor<2x3x5x4x16xi32>) -> tensor<5x48x8xi32> { // CHECK: %[[OUT:.+]] = tensor.empty() : tensor<5x48x8xi32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [2, 1, 0] // CHECK-SAME: inner_dims_pos = [2, 1] // CHECK-SAME: inner_tiles = [4, 16] @@ -690,7 +690,7 @@ func.func @non_involution_transpose_unpack_fold(%arg0: tensor<2x3x5x4x16xi32>) - func.func @unpack_non_involution_transpose_fold(%arg0: tensor<57x3x56x1x64xf32>) -> tensor<3648x3x56xf32> { %0 = tensor.empty() : tensor<3x56x3648xf32> - %unpack = tensor.unpack %arg0 + %unpack = linalg.unpack %arg0 outer_dims_perm = [2, 0, 1] inner_dims_pos = [1, 2] inner_tiles = [1, 64] @@ -706,7 +706,7 @@ func.func @unpack_non_involution_transpose_fold(%arg0: tensor<57x3x56x1x64xf32>) // CHECK-LABEL: func.func @unpack_non_involution_transpose_fold( // CHECK-SAME: %[[ARG0:.+]]: tensor<57x3x56x1x64xf32>) -> tensor<3648x3x56xf32> { // CHECK: %[[OUT:.+]] = tensor.empty() : tensor<3648x3x56xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 1, 2] // CHECK-SAME: inner_dims_pos = [2, 0] // 
CHECK-SAME: inner_tiles = [1, 64] @@ -722,7 +722,7 @@ func.func @transpose_unpacked_dims_no_fold(%arg0: tensor<2x16x5x4x3xi32>) -> ten outs(%0 : tensor<5x2x3x16x4xi32>) permutation = [2, 0, 4, 1, 3] %1 = tensor.empty() : tensor<5x32x12xi32> - %unpack = tensor.unpack %transposed + %unpack = linalg.unpack %transposed inner_dims_pos = [1, 2] inner_tiles = [16, 4] into %1 : tensor<5x2x3x16x4xi32> -> tensor<5x32x12xi32> @@ -730,7 +730,7 @@ func.func @transpose_unpacked_dims_no_fold(%arg0: tensor<2x16x5x4x3xi32>) -> ten } //CHECK-LABEL: func.func @transpose_unpacked_dims_no_fold( // CHECK: linalg.transpose -// CHECK: tensor.unpack +// CHECK: linalg.unpack // ----- @@ -747,7 +747,7 @@ func.func @generic_transpose_unpack_fold(%arg0: tensor<2x3x5x4x16xi32>) -> tenso linalg.yield %in : i32 } -> tensor<5x2x3x16x4xi32> %1 = tensor.empty() : tensor<5x48x8xi32> - %unpack = tensor.unpack %transposed + %unpack = linalg.unpack %transposed outer_dims_perm = [0, 2, 1] inner_dims_pos = [1, 2] inner_tiles = [16, 4] into @@ -757,7 +757,7 @@ func.func @generic_transpose_unpack_fold(%arg0: tensor<2x3x5x4x16xi32>) -> tenso //CHECK-LABEL: func.func @generic_transpose_unpack_fold( // CHECK-SAME: %[[ARG0:.+]]: tensor<2x3x5x4x16xi32>) -> tensor<5x48x8xi32> { // CHECK: %[[OUT:.+]] = tensor.empty() : tensor<5x48x8xi32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [2, 1, 0] // CHECK-SAME: inner_dims_pos = [2, 1] // CHECK-SAME: inner_tiles = [4, 16] @@ -771,7 +771,7 @@ func.func @generic_transpose_unpack_fold(%arg0: tensor<2x3x5x4x16xi32>) -> tenso #map1 = affine_map<(d0, d1, d2)->(d0, d1, d2)> func.func @unpack_generic_transpose_fold(%arg0: tensor<57x3x56x1x64xf32>) -> tensor<3648x3x56xf32> { %0 = tensor.empty() : tensor<3x56x3648xf32> - %unpack = tensor.unpack %arg0 + %unpack = linalg.unpack %arg0 outer_dims_perm = [2, 0, 1] inner_dims_pos = [1, 2] inner_tiles = [1, 64] @@ -791,7 +791,7 @@ func.func @unpack_generic_transpose_fold(%arg0: tensor<57x3x56x1x64xf32>) -> ten // CHECK-LABEL: func.func @unpack_generic_transpose_fold( // CHECK-SAME: %[[ARG0:.+]]: tensor<57x3x56x1x64xf32>) -> tensor<3648x3x56xf32> { // CHECK: %[[OUT:.+]] = tensor.empty() : tensor<3648x3x56xf32> -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ARG0]] // CHECK-SAME: outer_dims_perm = [0, 1, 2] // CHECK-SAME: inner_dims_pos = [2, 0] // CHECK-SAME: inner_tiles = [1, 64] diff --git a/mlir/test/Dialect/Tensor/invalid.mlir b/mlir/test/Dialect/Tensor/invalid.mlir index 0c6d8f4e05c33..654169841c1c1 100644 --- a/mlir/test/Dialect/Tensor/invalid.mlir +++ b/mlir/test/Dialect/Tensor/invalid.mlir @@ -635,181 +635,6 @@ func.func @empty_wrong_number_of_operands(%sz : index) { // ----- -func.func @pack_invalid_no_padding_no_full_tiles(%input: tensor<256x128xf32>, %output: tensor<8x8x16x33xf32>) -> tensor<8x8x16x33xf32> { - // expected-error@+1 {{invalid tile factor or output size provided. Only full tiles are supported when padding_value is not set}} - %0 = tensor.pack %input inner_dims_pos = [1, 0] inner_tiles = [16, 33] into %output : tensor<256x128xf32> -> tensor<8x8x16x33xf32> - return %0 : tensor<8x8x16x33xf32> -} - -// ----- - -func.func @pack_invalid_no_padding_no_full_tiles_dyn_tiles(%input: tensor<256x128xf32>, %output: tensor<10x8x?x?xf32>, %tile_size_0: index, %tile_size_1: index) -> tensor<10x8x?x?xf32> { - // expected-error@+1 {{invalid tile factor or output size provided. 
Only full tiles are supported when padding_value is not set}} - %0 = tensor.pack %input inner_dims_pos = [1, 0] inner_tiles = [%tile_size_0, %tile_size_1] into %output : tensor<256x128xf32> -> tensor<10x8x?x?xf32> - return %0 : tensor<10x8x?x?xf32> -} - -// ----- - -func.func @pack_invalid_no_padding_no_full_tiles_dyn_tiles_outperm(%input: tensor<256x128xf32>, %output: tensor<8x10x?x?xf32>, %tile_size_0: index, %tile_size_1: index) -> tensor<8x10x?x?xf32> { - // expected-error@+1 {{invalid tile factor or output size provided. Only full tiles are supported when padding_value is not set}} - %0 = tensor.pack %input outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [%tile_size_0, %tile_size_1] into %output : tensor<256x128xf32> -> tensor<8x10x?x?xf32> - return %0 : tensor<8x10x?x?xf32> -} - -// ----- - -func.func @pad_and_pack_invalid_type(%input: tensor<13x15xf32>, %output: tensor<2x8x8x2xf32>, %pad: i32) -> tensor<2x8x8x2xf32> { - // expected-error@+1 {{expected padding_value has 'f32' but got: 'i32'}} - %0 = tensor.pack %input padding_value(%pad: i32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor<13x15xf32> -> tensor<2x8x8x2xf32> - return %0 : tensor<2x8x8x2xf32> -} - -// ----- - -func.func @pack_invalid_inner_dims_pos_vector(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { - // expected-error@+1 {{invalid inner_dims_pos vector}} - %0 = tensor.pack %input inner_dims_pos = [2, 0] inner_tiles = [2, 2] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32> - return %0 : tensor<8x8x32x16xf32> -} - -// ----- - -func.func @pack_invalid_duplicate_element_in_inner_dims(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { - // expected-error@+1 {{invalid inner_dims_pos vector}} - %0 = tensor.pack %input inner_dims_pos = [1, 1] inner_tiles = [2, 2] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32> - return %0 : tensor<8x8x32x16xf32> -} - -// ----- - -func.func @pack_invalid_duplicate_element_in_outer_perm(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { - // expected-error@+1 {{invalid outer_dims_perm vector}} - %0 = tensor.pack %input outer_dims_perm = [1, 1] inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32> - return %0 : tensor<8x8x32x16xf32> -} - -// ----- - -func.func @pack_invalid_output_rank(%input: tensor<256x128xf32>, %output: tensor<64x32x16xf32>) -> tensor<64x32x16xf32> { - // expected-error@+1 {{packed rank != (unpacked rank + num tiling factors), got 3 != 4}} - %0 = tensor.pack %input inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %output : tensor<256x128xf32> -> tensor<64x32x16xf32> - return %0 : tensor<64x32x16xf32> -} - -// ----- - -func.func @unpack_invalid_output_rank(%input: tensor<256x128xf32>, %output: tensor<64x32x16xf32>) -> tensor<256x128xf32> { - // expected-error@+1 {{packed rank != (unpacked rank + num tiling factors), got 3 != 4}} - %0 = tensor.unpack %output inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %input : tensor<64x32x16xf32> -> tensor<256x128xf32> - return %0 : tensor<256x128xf32> -} - -// ----- - -func.func @unpack_invalid_out_of_bound_outer_perm(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { - // expected-error@+1 {{invalid outer_dims_perm vector}} - %0 = tensor.unpack %output outer_dims_perm = [2, 1] inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %input : tensor<8x8x32x16xf32> -> 
tensor<256x128xf32> - return %0 : tensor<256x128xf32> -} - -// ----- - -func.func @pack_invalid_outer_dims_perm(%source: tensor<128x256xf32>, %dest: tensor<16x4x32x16xf32>) -> tensor<16x4x32x16xf32> { - // expected-error@+1 {{outer_dims_perm must be a permutation or empty}} - %0 = tensor.pack %source outer_dims_perm = [0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<16x4x32x16xf32> - return %0 : tensor<16x4x32x16xf32> -} - -// ----- - -func.func @unpack_invalid_outer_dims_perm(%source: tensor<128x256xf32>, %dest: tensor<16x4x32x16xf32>) -> tensor<128x256xf32> { - // expected-error@+1 {{outer_dims_perm must be a permutation or empty}} - %0 = tensor.unpack %dest outer_dims_perm = [1] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %source : tensor<16x4x32x16xf32> -> tensor<128x256xf32> - return %0 : tensor<128x256xf32> -} - -// ----- - -func.func @pack_invalid(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { - // expected-error@+1 {{the shape of output is not large enough to hold the packed data. Expected at least 'tensor<8x8x16x32xf32>', got 'tensor<8x8x32x16xf32>'}} - %0 = tensor.pack %input inner_dims_pos = [1, 0] inner_tiles = [16, 32] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32> - return %0 : tensor<8x8x32x16xf32> -} - -// ----- - -func.func @unpack_invalid(%output: tensor<256x128xf32>, %input: tensor<8x8x32x16xf32>) -> tensor<256x128xf32> { - // expected-error@+1 {{the shape of output is not large enough to hold the packed data. Expected at least 'tensor<8x32x4x32xf32>', got 'tensor<8x8x32x16xf32>'}} - %0 = tensor.unpack %input inner_dims_pos = [1, 0] inner_tiles = [4, 32] into %output : tensor<8x8x32x16xf32> -> tensor<256x128xf32> - return %0 : tensor<256x128xf32> -} - -// ----- - -func.func @pack_invalid(%input: tensor<256x128xf32>, %output: tensor<8x8x32x16xf32>) -> tensor<8x8x32x16xf32> { - // expected-error@+1 {{invalid zero tile factor}} - %0 = tensor.pack %input inner_dims_pos = [1, 0] inner_tiles = [0, 2] into %output : tensor<256x128xf32> -> tensor<8x8x32x16xf32> - return %0 : tensor<8x8x32x16xf32> -} - -// ----- -func.func @pack_mismatch_inner_tile_size_and_output_shape( - %input : tensor, %output : tensor) -> tensor { - // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} - %0 = tensor.pack %input inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %output : tensor -> tensor - return %0 : tensor -} - -// ----- - -func.func @pack_dynamic_inner_tile_size_and_static_output_shape( - %input : tensor, %output : tensor) -> tensor { - %c8 = arith.constant 8 : index - // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} - %0 = tensor.pack %input inner_dims_pos = [0, 1] inner_tiles = [8, %c8] into %output : tensor -> tensor - return %0 : tensor -} - -// ----- - -func.func @pack_static_inner_tile_size_and_dynamic_output_shape( - %input : tensor, %output : tensor) -> tensor { - // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} - %0 = tensor.pack %input inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %output : tensor -> tensor - return %0 : tensor -} - -// ----- - -func.func @unpack_mismatch_inner_tile_size_and_output_shape( - %input : tensor, %output : tensor) -> tensor { - // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} - %0 = tensor.unpack 
%input inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %output : tensor -> tensor - return %0 : tensor -} - -// ----- - -func.func @unpack_dynamic_inner_tile_size_and_static_output_shape( - %input : tensor, %output : tensor) -> tensor { - %c8 = arith.constant 8 : index - // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} - %0 = tensor.unpack %input inner_dims_pos = [0, 1] inner_tiles = [%c8, 4] into %output : tensor -> tensor - return %0 : tensor -} - -// ----- - -func.func @unpack_static_inner_tile_size_and_dynamic_output_shape( - %input : tensor, %output : tensor) -> tensor { - // expected-error@+1 {{mismatch in inner tile sizes specified and shaped of tiled dimension in the packed type}} - %0 = tensor.unpack %input inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %output : tensor -> tensor - return %0 : tensor -} - -// ----- - func.func @bitcast_index_0(%arg0 : tensor) -> tensor { // expected-error @+1 {{'tensor.bitcast' op result #0 must be tensor of signless integer or unsigned integer or signed integer or floating-point values, but got 'tensor'}} %0 = tensor.bitcast %arg0 : tensor to tensor diff --git a/mlir/test/Dialect/Tensor/ops.mlir b/mlir/test/Dialect/Tensor/ops.mlir index 378137a14b59f..930986211cb6d 100644 --- a/mlir/test/Dialect/Tensor/ops.mlir +++ b/mlir/test/Dialect/Tensor/ops.mlir @@ -358,106 +358,3 @@ func.func @gather_scatter( (tensor<1x3x4xf32>, tensor<4x5x6xf32>, tensor<1x3x2xi32>) -> tensor<4x5x6xf32> return } - -// ----- - -func.func @pack_nc_to_ncnc(%source: tensor<128x256xf32>, %dest: tensor<4x16x32x16xf32>) -> tensor<128x256xf32> { - %0 = tensor.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> - %1 = tensor.empty() : tensor<128x256xf32> - %2 = tensor.unpack %0 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %1 : tensor<4x16x32x16xf32> -> tensor<128x256xf32> - return %2 : tensor<128x256xf32> -} - -// CHECK-LABEL: func.func @pack_nc_to_ncnc( -// CHECK-SAME: %[[SOURCE:.*]]: tensor<128x256xf32>, -// CHECK-SAME: %[[DEST:.*]]: tensor<4x16x32x16xf32>) -// CHECK: %[[PACKED:.*]] = tensor.pack %[[SOURCE]] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %[[DEST]] : tensor<128x256xf32> -> tensor<4x16x32x16xf32> -// CHECK: %[[BUFF:.*]] = tensor.empty() : tensor<128x256xf32> -// CHECK: %{{.*}} = tensor.unpack %[[PACKED]] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %[[BUFF]] : tensor<4x16x32x16xf32> -> tensor<128x256xf32> - -// ----- - -func.func @pack_nc_to_ncnc_with_padding(%source: tensor<13x15xf32>, %dest: tensor<2x8x8x2xf32>, %padding: f32) -> tensor<13x15xf32> { - %0 = tensor.pack %source padding_value(%padding : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest : tensor<13x15xf32> -> tensor<2x8x8x2xf32> - %1 = tensor.empty() : tensor<13x15xf32> - %2 = tensor.unpack %0 inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %1 : tensor<2x8x8x2xf32> -> tensor<13x15xf32> - return %2 : tensor<13x15xf32> -} - -// CHECK-LABEL: func.func @pack_nc_to_ncnc_with_padding( -// CHECK-SAME: %[[SOURCE:.*]]: tensor<13x15xf32>, -// CHECK-SAME: %[[DEST:.*]]: tensor<2x8x8x2xf32>, -// CHECK-SAME: %[[PADDING:.*]]: f32) -// CHECK: %[[PACKED:.*]] = tensor.pack %[[SOURCE]] padding_value(%[[PADDING]] : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %[[DEST]] : tensor<13x15xf32> -> tensor<2x8x8x2xf32> -// CHECK: %[[BUFF:.*]] = tensor.empty() : tensor<13x15xf32> -// CHECK: %{{.*}} = tensor.unpack %[[PACKED]] inner_dims_pos = [0, 1] 
inner_tiles = [8, 2] into %[[BUFF]] : tensor<2x8x8x2xf32> -> tensor<13x15xf32> - -// ----- - -func.func @pack_ck_to_kcck(%source: tensor<128x256xf32>, %dest: tensor<16x4x32x16xf32>) -> tensor<128x256xf32> { - %0 = tensor.pack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<16x4x32x16xf32> - %1 = tensor.empty() : tensor<128x256xf32> - %2 = tensor.unpack %0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %1 : tensor<16x4x32x16xf32> -> tensor<128x256xf32> - return %2 : tensor<128x256xf32> -} - -// CHECK-LABEL: func.func @pack_ck_to_kcck( -// CHECK-SAME: %[[SOURCE:.*]]: tensor<128x256xf32>, -// CHECK-SAME: %[[DEST:.*]]: tensor<16x4x32x16xf32>) -// CHECK: %[[PACKED:.*]] = tensor.pack %[[SOURCE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %[[DEST]] : tensor<128x256xf32> -> tensor<16x4x32x16xf32> -// CHECK: %[[BUFF:.*]] = tensor.empty() : tensor<128x256xf32> -// CHECK: %{{.*}} = tensor.unpack %[[PACKED]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %[[BUFF]] : tensor<16x4x32x16xf32> -> tensor<128x256xf32> - -// ----- - -func.func @pad_and_pack_fully_dynamic(%source: tensor, %dest: tensor, %pad: f32, %tile_n : index, %tile_m : index) -> tensor { - %0 = tensor.pack %source padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor - return %0 : tensor -} - -// CHECK-LABEL: func.func @pad_and_pack_fully_dynamic( -// CHECK-SAME: %[[SOURCE:.*]]: tensor, -// CHECK-SAME: %[[DEST:.*]]: tensor, -// CHECK-SAME: %[[PAD:.*]]: f32, -// CHECK-SAME: %[[TILE_N:.*]]: index, -// CHECK-SAME: %[[TILE_M:.*]]: index) -// CHECK: %{{.*}} = tensor.pack %[[SOURCE]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [%[[TILE_N]], %[[TILE_M]]] into %[[DEST]] : tensor -> tensor - -// ----- - -func.func @pad_and_pack_partially_dynamic(%source: tensor, %dest: tensor, %pad: f32) -> tensor { - %0 = tensor.pack %source padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest : tensor -> tensor - return %0 : tensor -} - -// CHECK-LABEL: func.func @pad_and_pack_partially_dynamic( -// CHECK-SAME: %[[SOURCE:.*]]: tensor, -// CHECK-SAME: %[[DEST:.*]]: tensor, -// CHECK-SAME: %[[PAD:.*]]: f32) -// CHECK: %{{.*}} = tensor.pack %[[SOURCE]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %[[DEST]] : tensor -> tensor - -// ----- - -func.func @unpack_fully_dynamic(%source: tensor, %dest: tensor, %tile_n : index, %tile_m : index) -> tensor { - %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor - return %0 : tensor -} - -// CHECK-LABEL: func.func @unpack_fully_dynamic( -// CHECK-SAME: %[[SOURCE:.*]]: tensor, -// CHECK-SAME: %[[DEST:.*]]: tensor, -// CHECK-SAME: %[[TILE_N:.*]]: index, -// CHECK-SAME: %[[TILE_M:.*]]: index) -// CHECK: %{{.*}} = tensor.unpack %[[SOURCE]] inner_dims_pos = [0, 1] inner_tiles = [%[[TILE_N]], %[[TILE_M]]] into %[[DEST]] : tensor -> tensor - -// ----- - -func.func @unpack_partially_dynamic(%source: tensor, %dest: tensor) -> tensor { - %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %dest : tensor -> tensor - return %0: tensor -} - -// CHECK-LABEL: func.func @unpack_partially_dynamic( -// CHECK-SAME: %[[SOURCE:.*]]: tensor, -// CHECK-SAME: %[[DEST:.*]]: tensor) -// CHECK: %{{.*}} = tensor.unpack %[[SOURCE]] inner_dims_pos = [0, 1] 
inner_tiles = [8, 2] into %[[DEST]] : tensor -> tensor diff --git a/mlir/test/Dialect/Tensor/tiling.mlir b/mlir/test/Dialect/Tensor/tiling.mlir index e02ab06a9d533..3b3dc28e29a71 100644 --- a/mlir/test/Dialect/Tensor/tiling.mlir +++ b/mlir/test/Dialect/Tensor/tiling.mlir @@ -183,495 +183,3 @@ module attributes {transform.with_named_sequence} { transform.yield } } - -// ----- - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 * 32)> -// CHECK: func.func @NC_to_NCnc -// CHECK-SAME: %[[IN:.*]]: tensor<128x256xf32>, -// CHECK-SAME: %[[OUT:.*]]: tensor<4x8x32x32xf32>) -> tensor<4x8x32x32xf32> { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[RES0:.*]] = scf.for %[[N:.*]] = %[[C0]] to %[[C4]] step %[[C2]] iter_args(%[[ITER0:.*]] = %[[OUT]]) -> (tensor<4x8x32x32xf32>) { -// CHECK: %[[RES1:.+]] = scf.for %[[C:.*]] = %[[C0]] to %[[C8]] step %[[C4]] iter_args(%[[ITER1:.*]] = %[[ITER0]]) -> (tensor<4x8x32x32xf32>) { -// CHECK-DAG: %[[IN_N:.+]] = affine.apply #[[MAP0]](%[[N]]) -// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP0]](%[[C]]) -// CHECK: %[[SUB_IN:.*]] = tensor.extract_slice %[[IN]][%[[IN_N]], %[[IN_C]]] [64, 128] [1, 1] : tensor<128x256xf32> to tensor<64x128xf32> -// CHECK: %[[SUB_OUT:.*]] = tensor.extract_slice %[[ITER1]][%[[N]], %[[C]], 0, 0] [2, 4, 32, 32] [1, 1, 1, 1] : tensor<4x8x32x32xf32> to tensor<2x4x32x32xf32> -// CHECK: %[[SUB_RES:.*]] = tensor.pack -// CHECK-SAME: %[[SUB_IN]] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %[[SUB_OUT]] -// CHECK: %[[INSERT:.*]] = tensor.insert_slice %[[SUB_RES]] into %[[ITER1]] -// CHECK: scf.yield %[[INSERT]] : tensor<4x8x32x32xf32> -// CHECK: } -// CHECK: scf.yield %[[RES1:.*]] : tensor<4x8x32x32xf32> -// CHECK: } -// CHECK: return %[[RES0:.*]] : tensor<4x8x32x32xf32> -// CHECK: } -func.func @NC_to_NCnc(%arg0: tensor<128x256xf32>, %arg1: tensor<4x8x32x32xf32>) -> tensor<4x8x32x32xf32> { - %0 = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %arg1 : tensor<128x256xf32> -> tensor<4x8x32x32xf32> - return %0 : tensor<4x8x32x32xf32> -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK: #[[MAP0:.+]] = affine_map<(d0) -> (d0 * 8)> -// CHECK: func.func @KC_to_CKkc -// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: -// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index -// CHECK: scf.for %[[C:.+]] = %[[C0]] to %[[C32]] step %[[C2]] -// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP0]](%[[C]]) -// CHECK: %[[INPUT_SLICE:.+]] = tensor.extract_slice %[[IN]] -// CHECK-SAME: [0, %[[IN_C]]] [128, 16] -// CHECK: %[[OUTPUT_SLICE:.+]] = tensor.extract_slice %{{.+}}[%[[C]], 0, 0, 0] [2, 4, 32, 8] -// CHECK: tensor.pack -// CHECK-SAME: %[[INPUT_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] -// CHECK-SAME: into %[[OUTPUT_SLICE]] -func.func @KC_to_CKkc(%arg0: tensor<128x256xf32>, %arg1: 
tensor<32x4x32x8xf32>) -> tensor<32x4x32x8xf32> { - %0 = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %arg1 : tensor<128x256xf32> -> tensor<32x4x32x8xf32> - return %0 : tensor<32x4x32x8xf32> -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 * 2)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 * -2 + 15, 8)> -// CHECK: func.func @pad_and_pack_static( -// CHECK-SAME: %[[IN:.*]]: tensor<13x15xf32>, -// CHECK-SAME: %[[OUT:.*]]: tensor<2x8x8x2xf32>, -// CHECK-SAME: %[[PAD:.*]]: f32) -> tensor<2x8x8x2xf32> { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[RES0:.*]] = scf.for %[[J:.*]] = %[[C0]] to %[[C8]] step %[[C4]] iter_args(%[[ITER1:.*]] = %[[OUT]]) -> (tensor<2x8x8x2xf32>) { -// CHECK-DAG: %[[IN_J:.*]] = affine.apply #[[MAP0]](%[[J]]) -// CHECK-DAG: %[[IN_J_SZ:.*]] = affine.min #[[MAP1]](%[[J]]) -// CHECK: %[[SUB_IN:.*]] = tensor.extract_slice %[[IN]][0, %[[IN_J]]] [13, %[[IN_J_SZ]]] [1, 1] -// CHECK: %[[SUB_OUT:.*]] = tensor.extract_slice %[[ITER1]][0, %[[J]], 0, 0] [2, 4, 8, 2] [1, 1, 1, 1] -// CHECK: %[[SUB_RES:.*]] = tensor.pack -// CHECK-SAME: %[[SUB_IN]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] -// CHECK-SAME: into %[[SUB_OUT]] -// CHECK: %[[INSERT:.*]] = tensor.insert_slice %[[SUB_RES]] into %[[ITER1]] -// CHECK: scf.yield %[[INSERT]] : tensor<2x8x8x2xf32> -// CHECK: } -// CHECK: return %[[RES0:.*]] : tensor<2x8x8x2xf32> -// CHECK: } -func.func @pad_and_pack_static(%input: tensor<13x15xf32>, %output: tensor<2x8x8x2xf32>, %pad: f32) -> tensor<2x8x8x2xf32> { - %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor<13x15xf32> -> tensor<2x8x8x2xf32> - return %0 : tensor<2x8x8x2xf32> -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)> -// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> (d0 * 8)> -// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -8 + s0, d0 * 8)> -// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0) -> (d0 * 2)> -// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0, d1)[s0] -> (d1 * -2 + s0, d0 * 2)> -// CHECK: func.func @pad_and_pack_partially_dynamic( -// CHECK-SAME: %[[IN:.*]]: tensor, -// CHECK-SAME: %[[OUT:.*]]: tensor, -// CHECK-SAME: %[[PAD:.*]]: f32) -> tensor { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index 
-// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[OUT_D0:.*]] = tensor.dim %[[OUT]], %[[C0]] : tensor -// CHECK-DAG: %[[OUT_D1:.*]] = tensor.dim %[[OUT]], %[[C1]] : tensor -// CHECK: %[[RES0:.*]] = scf.for %[[I:.*]] = %[[C0]] to %[[OUT_D0]] step %[[C2]] iter_args(%[[ITER0:.*]] = %[[OUT]]) -> (tensor) { -// CHECK: %[[RES1:.*]] = scf.for %[[J:.*]] = %[[C0]] to %[[OUT_D1]] step %[[C4]] iter_args(%[[ITER1:.*]] = %[[ITER0]]) -> (tensor) { -// CHECK-DAG: %[[OUT_I_SZ:.*]] = affine.min #[[MAP0]](%[[I]])[%[[OUT_D0]]] -// CHECK-DAG: %[[OUT_J_SZ:.*]] = affine.min #[[MAP1]](%[[J]])[%[[OUT_D1]]] -// CHECK-DAG: %[[IN_I:.*]] = affine.apply #[[MAP2]](%[[I]]) -// CHECK-DAG: %[[IN_I_SZ:.*]] = affine.min #[[MAP3]] -// CHECK-DAG: %[[IN_J:.*]] = affine.apply #[[MAP4]](%[[J]]) -// CHECK-DAG: %[[IN_J_SZ:.*]] = affine.min #[[MAP5]] -// CHECK: %[[SUB_IN:.*]] = tensor.extract_slice %[[IN]][%[[IN_I]], %[[IN_J]]] [%[[IN_I_SZ]], %[[IN_J_SZ]]] [1, 1] : tensor to tensor -// CHECK: %[[SUB_OUT:.*]] = tensor.extract_slice %[[ITER1]][%[[I]], %[[J]], 0, 0] [%[[OUT_I_SZ]], %[[OUT_J_SZ]], 8, 2] [1, 1, 1, 1] : tensor to tensor -// CHECK: %[[SUB_RES:.*]] = tensor.pack -// CHECK-SAME: %[[SUB_IN]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] -// CHECK-SAME: into %[[SUB_OUT]] -// CHECK: %[[INSERT:.*]] = tensor.insert_slice %[[SUB_RES]] into %[[ITER1]] -// CHECK: scf.yield %[[INSERT]] : tensor -// CHECK: } -// CHECK: scf.yield %[[RES1:.*]] : tensor -// CHECK: } -// CHECK: return %[[VAL_34:.*]] : tensor -// CHECK: } -func.func @pad_and_pack_partially_dynamic(%input: tensor, %output: tensor, %pad: f32) -> tensor { - %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor -> tensor - return %0 : tensor -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)> -// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (d0 * s0)> -// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s0, -(d1 * s0) + s1)> -// CHECK: func.func @pad_and_pack_fully_dynamic( -// CHECK-SAME: %[[IN:.*]]: tensor, -// CHECK-SAME: %[[OUT:.*]]: tensor, -// CHECK-SAME: %[[PAD:.*]]: f32, -// CHECK-SAME: %[[TILE_0:.*]]: index, -// CHECK-SAME: %[[TILE_1:.*]]: index) -> tensor { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[OUT_D0:.*]] = tensor.dim %[[OUT]], %[[C0]] : tensor -// CHECK-DAG: %[[OUT_D1:.*]] = tensor.dim %[[OUT]], %[[C1]] : tensor -// CHECK: %[[RES0:.*]] = scf.for %[[I:.*]] = %[[C0]] to %[[OUT_D0]] step %[[C2]] iter_args(%[[ITER0:.*]] = %[[OUT]]) -> (tensor) { -// CHECK: %[[RES1:.*]] = scf.for %[[J:.*]] = %[[C0]] to %[[OUT_D1]] step %[[C4]] iter_args(%[[ITER1:.*]] = %[[ITER0]]) -> (tensor) { -// CHECK-DAG: %[[OUT_I_SZ:.*]] = affine.min #[[MAP0]](%[[I]])[%[[OUT_D0]]] -// CHECK-DAG: 
%[[OUT_J_SZ:.*]] = affine.min #[[MAP1]](%[[J]])[%[[OUT_D1]]] -// CHECK-DAG: %[[IN_D0:.*]] = tensor.dim %[[IN]], %[[C0]] -// CHECK-DAG: %[[IN_D1:.*]] = tensor.dim %[[IN]], %[[C1]] -// CHECK: %[[IN_I:.*]] = affine.apply #[[MAP2]](%[[I]])[%[[TILE_0]]] -// CHECK: %[[IN_I_SZ:.*]] = affine.min #[[MAP3]](%[[OUT_I_SZ]], %[[I]])[%[[TILE_0]], %[[IN_D0]]] -// CHECK: %[[IN_J:.*]] = affine.apply #[[MAP2]](%[[J]])[%[[TILE_1]]] -// CHECK: %[[IN_J_SZ:.*]] = affine.min #[[MAP3]](%[[OUT_J_SZ]], %[[J]])[%[[TILE_1]], %[[IN_D1]]] -// CHECK: %[[SUB_IN:.*]] = tensor.extract_slice %[[IN]][%[[IN_I]], %[[IN_J]]] [%[[IN_I_SZ]], %[[IN_J_SZ]]] [1, 1] : tensor to tensor -// CHECK: %[[OUT_D2:.+]] = tensor.dim %[[ITER1]], %[[C2]] -// CHECK: %[[OUT_D3:.+]] = tensor.dim %[[ITER1]], %[[C3]] -// CHECK: %[[SUB_OUT:.*]] = tensor.extract_slice %[[ITER1]][%[[I]], %[[J]], 0, 0] [%[[OUT_I_SZ]], %[[OUT_J_SZ]], %[[OUT_D2]], %[[OUT_D3]]] [1, 1, 1, 1] : tensor to tensor -// CHECK: %[[PACK:.*]] = tensor.pack -// CHECK-SAME: %[[SUB_IN]] padding_value(%[[PAD]] : f32) inner_dims_pos = [0, 1] inner_tiles = [%[[TILE_0]], %[[TILE_1]]] -// CHECK-SAME: into %[[SUB_OUT]] -// CHECK: %[[INSERT:.*]] = tensor.insert_slice %[[PACK]] into %[[ITER1]] -// CHECK: scf.yield %[[INSERT]] : tensor -// CHECK: } -// CHECK: scf.yield %[[RES1:.*]] : tensor -// CHECK: } -// CHECK: return %[[RES0:.*]] : tensor -// CHECK: } -func.func @pad_and_pack_fully_dynamic(%source: tensor, %dest: tensor, %pad: f32, %tile_n : index, %tile_m : index) -> tensor { - %0 = tensor.pack %source padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor - return %0 : tensor -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 floordiv 32)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 mod 32)> -// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> ((d0 + 1) floordiv 32 - d0 floordiv 32 + 1)> -// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0) -> (d0 floordiv 16)> -// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0) -> (d0 mod 16)> -// CHECK-DAG: #[[MAP6:.+]] = affine_map<(d0) -> ((d0 + 3) floordiv 16 - d0 floordiv 16 + 1)> -// CHECK: func.func @NCnc_to_NC -// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: -// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index -// CHECK-DAG: %[[C256:.*]] = arith.constant 256 : index -// CHECK: %{{.+}} = scf.for %[[I:.+]] = %[[C0]] to %[[C256]] step %[[C2]] -// CHECK: %{{.+}} = scf.for %[[J:.+]] = %[[C0]] to %[[C128]] step %[[C4]] -// CHECK-DAG: %[[IN_I:.+]] = affine.apply #[[MAP0]](%[[I]]) -// CHECK-DAG: %[[OFFSET_I:.+]] = affine.apply #[[MAP1]](%[[I]]) -// CHECK-DAG: %[[IN_I_SZ:.+]] = affine.apply #[[MAP2]](%[[I]]) -// CHECK-DAG: %[[IN_J:.+]] = affine.apply #[[MAP4]](%[[J]]) -// CHECK-DAG: %[[OFFSET_J:.+]] = affine.apply #[[MAP5]](%[[J]]) -// CHECK-DAG: %[[IN_J_SZ:.+]] = affine.apply #[[MAP6]](%[[J]]) -// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[IN]] -// CHECK-SAME: 
[%[[IN_I]], %[[IN_J]], 0, 0] [%[[IN_I_SZ]], %[[IN_J_SZ]], 32, 16] -// CHECK-SAME: : tensor<8x8x32x16xf32> to tensor -// CHECK: %[[EMPTY:.+]] = tensor.empty -// CHECK: %[[UNPACK:.+]] = tensor.unpack -// CHECK-SAME: %[[SLICE]] inner_dims_pos = [0, 1] inner_tiles = [32, 16] -// CHECK-SAME: into %[[EMPTY]] -// CHECK: %[[UNPACK_SLICE:.+]] = tensor.extract_slice %[[UNPACK]] -// CHECK-SAME: [%[[OFFSET_I]], %[[OFFSET_J]]] [2, 4] -// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK_SLICE]] -// CHECK-SAME: into %{{.+}}[%[[I]], %[[J]]] [2, 4] -// CHECK: scf.yield %[[RES]] -func.func @NCnc_to_NC(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { - %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> - return %0 : tensor<256x128xf32> -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 floordiv 32)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 mod 32)> -// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> ((d0 + 1) floordiv 32 - d0 floordiv 32 + 1)> -// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0) -> (d0 floordiv 8)> -// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0) -> (d0 mod 8)> -// CHECK-DAG: #[[MAP6:.+]] = affine_map<(d0) -> ((d0 + 3) floordiv 8 - d0 floordiv 8 + 1)> -// CHECK: func.func @CKkc_to_KC -// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: -// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index -// CHECK-DAG: %[[C256:.*]] = arith.constant 256 : index -// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[C128]] step %[[C2]] -// CHECK: %{{.+}} = scf.for %[[C:.+]] = %[[C0]] to %[[C256]] step %[[C4]] -// CHECK-DAG: %[[IN_K:.+]] = affine.apply #[[MAP0]](%[[K]]) -// CHECK-DAG: %[[OFFSET_K:.+]] = affine.apply #[[MAP1]](%[[K]]) -// CHECK-DAG: %[[IN_K_SZ:.+]] = affine.apply #[[MAP2]](%[[K]]) -// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP4]](%[[C]]) -// CHECK-DAG: %[[OFFSET_C:.+]] = affine.apply #[[MAP5]](%[[C]]) -// CHECK-DAG: %[[IN_C_SZ:.+]] = affine.apply #[[MAP6]](%[[C]]) -// CHECK: %[[IN_SLICE:.+]] = tensor.extract_slice %[[IN]] -// CHECK: [%[[IN_C]], %[[IN_K]], 0, 0] [%[[IN_C_SZ]], %[[IN_K_SZ]], 32, 8] -// CHECK: %[[EMPTY:.+]] = tensor.empty -// CHECK: %[[UNPACK:.+]] = tensor.unpack -// CHECK-SAME: %[[IN_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] -// CHECK-SAME: into %[[EMPTY]] -// CHECK: %[[UNPACK_SLICE:.+]] = tensor.extract_slice %[[UNPACK]] -// CHECK-SAME: [%[[OFFSET_K]], %[[OFFSET_C]]] [2, 4] -// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK_SLICE]] -// CHECK-SAME: into %{{.+}}[%[[K]], %[[C]]] [2, 4] -// CHECK: scf.yield %[[RES]] -func.func @CKkc_to_KC(%source: tensor<32x4x32x8xf32>, %dest: tensor<128x256xf32>) -> tensor<128x256xf32> { - %0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 8] into %dest : tensor<32x4x32x8xf32> -> tensor<128x256xf32> - return %0 : 
tensor<128x256xf32> -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0) -> (d0 floordiv 2)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 floordiv 4)> -// CHECK: func.func @perfect_CKkc_to_KC -// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: -// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index -// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[C8]] step %[[C2]] -// CHECK: %{{.+}} = scf.for %[[C:.+]] = %[[C0]] to %[[C128]] step %[[C4]] -// CHECK-DAG: %[[IN_K:.+]] = affine.apply #[[MAP0]](%[[K]]) -// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP1]](%[[C]]) -// CHECK: %[[IN_SLICE:.+]] = tensor.extract_slice %[[IN]] -// CHECK: [%[[IN_C]], %[[IN_K]], 0, 0] [1, 1, 2, 4] -// CHECK: %[[ITER_SLICE:.+]] = tensor.extract_slice %{{.+}}[%[[K]], %[[C]]] [2, 4] -// CHECK: %[[UNPACK:.+]] = tensor.unpack -// CHECK-SAME: %[[IN_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 4] -// CHECK-SAME: into %[[ITER_SLICE]] -// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK]] -// CHECK-SAME: into %{{.+}}[%[[K]], %[[C]]] [2, 4] -// CHECK: scf.yield %[[RES]] -func.func @perfect_CKkc_to_KC(%source: tensor<32x4x2x4xf32>, %dest: tensor<8x128xf32>) -> tensor<8x128xf32> { - %0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 4] into %dest : tensor<32x4x2x4xf32> -> tensor<8x128xf32> - return %0 : tensor<8x128xf32> -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)> -// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0) -> (d0 floordiv 2)> -// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0) -> (d0 ceildiv 2)> -// CHECK: func.func @dynamic_perfect_CKkc_to_KC -// CHECK-SAME: %[[IN:[A-Za-z0-9]+]]: -// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[DIM_0:.+]] = tensor.dim %[[OUT]], %[[C0]] -// CHECK-DAG: %[[DIM_1:.+]] = tensor.dim %[[OUT]], %[[C1]] -// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[DIM_0]] step %[[C2]] -// CHECK: %{{.+}} = scf.for %[[C:.+]] = %[[C0]] to %[[DIM_1]] step %[[C4]] -// CHECK-DAG: %[[OUT_K_SZ:.+]] = affine.min #[[MAP0]](%[[K]])[%[[DIM_0]]] -// CHECK-DAG: %[[OUT_C_SZ:.+]] = affine.min #[[MAP1]](%[[C]])[%[[DIM_1]]] -// CHECK-DAG: %[[IN_K:.+]] = affine.apply 
#[[MAP2]](%[[K]]) -// CHECK-DAG: %[[IN_C:.+]] = affine.apply #[[MAP2]](%[[C]]) -// CHECK-DAG: %[[IN_C_SZ:.+]] = affine.apply #[[MAP3]](%[[OUT_C_SZ]]) -// CHECK: %[[IN_SLICE:.+]] = tensor.extract_slice %[[IN]] -// CHECK: [%[[IN_C]], %[[IN_K]], 0, 0] [%[[IN_C_SZ]], 1, 2, 2] -// CHECK: %[[ITER_SLICE:.+]] = tensor.extract_slice %{{.+}}[%[[K]], %[[C]]] [%[[OUT_K_SZ]], %[[OUT_C_SZ]]] -// CHECK: %[[UNPACK:.+]] = tensor.unpack -// CHECK-SAME: %[[IN_SLICE]] outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 2] -// CHECK-SAME: into %[[ITER_SLICE]] -// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK]] -// CHECK-SAME: into %{{.+}}[%[[K]], %[[C]]] [%[[OUT_K_SZ]], %[[OUT_C_SZ]]] -// CHECK: scf.yield %[[RES]] - -func.func @dynamic_perfect_CKkc_to_KC(%source: tensor, %dest: tensor) -> tensor { - %0 = tensor.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %dest : tensor -> tensor - return %0 : tensor -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [2, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK: #[[MAP:.+]] = affine_map<(d0) -> (d0 floordiv 2)> -// CHECK: func.func @perfect_NKPQk_to_NPQK( -// CHECK-SAME: %[[SOURCE:.+]]: tensor<1x4x6x6x2xf32>, -// CHECK-SAME: %{{.+}}: tensor<1x6x6x8xf32>) -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index -// CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index -// CHECK: %{{.+}} = scf.for %[[P:.+]] = %[[C0]] to %[[C6]] step %[[C1]] -// CHECK: %{{.+}} = scf.for %[[Q:.+]] = %[[C0]] to %[[C6]] step %[[C1]] -// CHECK: %{{.+}} = scf.for %[[K:.+]] = %[[C0]] to %[[C8]] step %[[C4]] -// CHECK: %[[K_SZ:.+]] = affine.apply #[[MAP]](%[[K]]) -// CHECK: %[[SLICE_SOURCE:.+]] = tensor.extract_slice %[[SOURCE]][0, %[[K_SZ]], %[[P]], %[[Q]], 0] -// CHECK: %[[SLICE_DEST:.+]] = tensor.extract_slice %{{.+}}[0, %[[P]], %[[Q]], %[[K]]] -// CHECK: %[[UNPACK:.+]] = tensor.unpack -// CHECK-SAME: %[[SLICE_SOURCE]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [2] -// CHECK-SAME: into %[[SLICE_DEST]] -// CHECK: %[[RES:.+]] = tensor.insert_slice %[[UNPACK]] -// CHECK-SAME: into %{{.+}}[0, %[[P]], %[[Q]], %[[K]]] -// CHECK: scf.yield %[[RES]] - -func.func @perfect_NKPQk_to_NPQK(%source: tensor<1x4x6x6x2xf32>, %dest: tensor<1x6x6x8xf32>) -> tensor<1x6x6x8xf32> { - %0 = tensor.unpack %source outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [2] into %dest : tensor<1x4x6x6x2xf32> -> tensor<1x6x6x8xf32> - return %0 : tensor<1x6x6x8xf32> -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -func.func private @get_dynamic_tile_size() -> index - -// 
CHECK-LABEL: func.func @fully_dynamic_unpack -// CHECK-SAME: %[[SRC:[0-9a-zA-Z]+]] -// CHECK-SAME: %[[DST:[0-9a-zA-Z]+]] -// CHECK: %[[INNER_TS:.+]] = call @get_dynamic_tile_size() : () -> index -// CHECK: %[[TD0:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC0:.*]] = %[[DST]]) -// CHECK: %[[TD1:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC1:.*]] = %[[TC0]]) -// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[SRC]] -// CHECK: %[[EMPTY:.+]] = tensor.empty -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[SLICE]] -// CHECK-SAME: inner_dims_pos = [1, 0] inner_tiles = [%[[INNER_TS]], %[[INNER_TS]]] into %[[EMPTY]] -func.func @fully_dynamic_unpack(%source: tensor, %dest: tensor) -> tensor { - %0 = func.call @get_dynamic_tile_size() : () -> index - %1 = tensor.unpack %source inner_dims_pos = [1, 0] inner_tiles = [%0, %0] into %dest : tensor -> tensor - return %1 : tensor -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [4, 8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} - -// ----- - -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0) -> (d0 * 2)> -// CHECK: func.func @perfect_NPQK_to_NKPQk -// CHECK-SAME: %[[SOURCE:.+]]: tensor<1x6x6x8xf32>, -// CHECK-SAME: %{{.+}}: tensor<1x4x6x6x2xf32>) -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index -// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index -// CHECK: %{{.+}} = scf.for %[[ARG2:.+]] = %[[C0]] to %[[C4]] step %[[C1]] -// CHECK: %{{.+}} = scf.for %[[ARG4:.+]] = %[[C0]] to %[[C6]] step %[[C1]] -// CHECK: %{{.+}} = scf.for %[[ARG6:.+]] = %[[C0]] to %[[C6]] step %[[C1]] -// CHECK: %[[APPLY:.+]] = affine.apply #[[MAP1]](%[[ARG2]]) -// CHECK: %[[SLICE_SOURCE:.+]] = tensor.extract_slice %[[SOURCE]][0, %[[ARG4]], %[[ARG6]], %[[APPLY]]] -// CHECK: %[[SLICE_DEST:.+]] = tensor.extract_slice %{{.+}}[0, %[[ARG2]], %[[ARG4]], %[[ARG6]], 0] -// CHECK: %[[PACK:.+]] = tensor.pack -// CHECK-SAME: %[[SLICE_SOURCE]] outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [2] -// CHECK-SAME: into %[[SLICE_DEST]] -// CHECK: %[[RES:.+]] = tensor.insert_slice %[[PACK]] -// CHECK-SAME: into %{{.+}}[0, %[[ARG2]], %[[ARG4]], %[[ARG6]], 0] -// CHECK: scf.yield %[[RES]] - -func.func @perfect_NPQK_to_NKPQk(%source: tensor<1x6x6x8xf32>, %dest: tensor<1x4x6x6x2xf32>) -> tensor<1x4x6x6x2xf32> { - %0 = tensor.pack %source outer_dims_perm = [0, 3, 1, 2] inner_dims_pos = [3] inner_tiles = [2] into %dest : tensor<1x6x6x8xf32> -> tensor<1x4x6x6x2xf32> - return %0 : tensor<1x4x6x6x2xf32> -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %0 = transform.structured.match ops{["tensor.pack"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loops:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) - transform.yield - } -} diff --git a/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir b/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir index 
6705905633e0f..dfc79a19e6cc6 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-permutation-lowering.mlir @@ -1,5 +1,8 @@ // RUN: mlir-opt %s --transform-interpreter --split-input-file | FileCheck %s +// TODO: Review the usage of `in_bounds` and remove where not affecting the +// generated output. + /// CHECK: #[[$MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d1, 0, d3)> ///---------------------------------------------------------------------------------------- @@ -106,8 +109,8 @@ func.func @xfer_write_minor_identity_transposed_map_masked( /// (neither a minor identity nor transposed minor identity map) /// OUT 1: vector.broadcast + vector.transfer_write /// (transposed minor identity) -/// OUT 2: vector.transfer_write -> vector.broadcast + vector.transpose + vector.transfer_write -/// (minor identity) +/// OUT 2: vector.transfer_write -> vector.broadcast + vector.transpose +/// + vector.transfer_write (minor identity) ///---------------------------------------------------------------------------------------- // CHECK-LABEL: func.func @xfer_write_non_minor_identity( @@ -233,16 +236,16 @@ func.func @xfer_write_non_minor_identity_masked_scalable( // CHECK-LABEL: func @xfer_write_non_minor_identity_masked_2 // CHECK-SAME: %[[DEST:.*]]: tensor // CHECK-SAME: %[[VEC:.*]]: vector<14x8x16xf32> -// CHECK-SAME: %[[DIM:.*]]: index, %[[IDX:.*]]: index) -> tensor +// CHECK-SAME: %[[MASK:.*]]: vector<14x8x16xi1> +// CHECK-SAME: %[[DIM:.*]]: index // CHECK-NOT: vector.broadcast -// CHECK: vector.mask %0 { vector.transfer_write %[[VEC]], %[[DEST]]{{.*}} : vector<14x8x16xf32>, tensor } : vector<14x8x16xi1> -> tensor +// CHECK: vector.mask %[[MASK]] { vector.transfer_write %[[VEC]], %[[DEST]]{{.*}} : vector<14x8x16xf32>, tensor } : vector<14x8x16xi1> -> tensor func.func @xfer_write_non_minor_identity_masked_2( %dest : tensor, %vec : vector<14x8x16xf32>, - %dim : index, + %mask: vector<14x8x16xi1>, %idx: index) -> tensor { - %mask = vector.create_mask %dim, %dim, %dim : vector<14x8x16xi1> %res = vector.mask %mask { vector.transfer_write %vec, %dest[%idx, %idx, %idx, %idx] { in_bounds = [false, false, true], @@ -259,29 +262,27 @@ func.func @xfer_write_non_minor_identity_masked_2( /// /// IN: vector.transfer_read /// (_transposed_ minor identity permutation map, with 0 or more broadcast dims) -/// OUT: vector.transpose + vector.transfer_write +/// OUT: vector.transfer_read + vector.broadcast + vector.transpose /// (minor identity permutation map with 0 or more leading broadcast dims) ///---------------------------------------------------------------------------------------- /// TODO: Inner broadcast dim - see also the block at the bottom of this file -// CHECK-LABEL: func.func @xfer_read_minor_identity_tranposed_with_mask +// CHECK-LABEL: func.func @xfer_read_minor_identity_transposed_with_mask // CHECK-SAME: %[[MEM:.*]]: memref, -// CHECK-SAME: %[[DIM_1:.*]]: index, %[[DIM_2:.*]]: index, %[[IDX:.*]]: index) -> vector<8x4x2xf32> { +// CHECK-SAME: %[[MASK:.*]]: vector<2x4xi1> +// CHECK-SAME: %[[IDX:.*]]: index // CHECK: %[[PASS_THROUGH:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[MASK:.*]] = vector.create_mask %[[DIM_2]], %[[DIM_1]] : vector<2x4xi1> // CHECK: %[[T_READ:.*]] = vector.transfer_read %[[MEM]]{{\[}}%[[IDX]], %[[IDX]]], %[[PASS_THROUGH]], %[[MASK]] {in_bounds = [true, true]} : memref, vector<2x4xf32> // CHECK: %[[BCAST:.*]] = vector.broadcast %[[T_READ]] : vector<2x4xf32> to vector<8x2x4xf32> // CHECK: 
%[[TRANSPOSE:.*]] = vector.transpose %[[BCAST]], [0, 2, 1] : vector<8x2x4xf32> to vector<8x4x2xf32> // CHECK: return %[[TRANSPOSE]] : vector<8x4x2xf32> -func.func @xfer_read_minor_identity_tranposed_with_mask( +func.func @xfer_read_minor_identity_transposed_with_mask( %mem: memref, - %dim_1: index, - %dim_2: index, + %mask: vector<2x4xi1>, %idx: index) -> (vector<8x4x2xf32>) { %pad = arith.constant 0.000000e+00 : f32 - %mask = vector.create_mask %dim_2, %dim_1 : vector<2x4xi1> %res = vector.transfer_read %mem[%idx, %idx], %pad, %mask { in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1) -> (0, d1, d0)> @@ -290,24 +291,22 @@ func.func @xfer_read_minor_identity_tranposed_with_mask( return %res : vector<8x4x2xf32> } -// CHECK-LABEL: func.func @xfer_read_minor_identity_tranposed_with_mask_scalable( +// CHECK-LABEL: func.func @xfer_read_minor_identity_transposed_with_mask_scalable( // CHECK-SAME: %[[MEM:.*]]: memref, -// CHECK-SAME: %[[DIM_1:.*]]: index, %[[DIM_2:.*]]: index, %[[IDX:.*]]: index) -> vector<8x[4]x2xf32> { +// CHECK-SAME: %[[MASK:.*]]: vector<2x[4]xi1> +// CHECK-SAME: %[[IDX:.*]]: index // CHECK: %[[PAD:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[MASK:.*]] = vector.create_mask %[[DIM_2]], %[[DIM_1]] : vector<2x[4]xi1> // CHECK: %[[T_READ:.*]] = vector.transfer_read %[[MEM]]{{\[}}%[[IDX]], %[[IDX]]], %[[PAD]], %[[MASK]] {in_bounds = [true, true]} : memref, vector<2x[4]xf32> // CHECK: %[[BCAST:.*]] = vector.broadcast %[[T_READ]] : vector<2x[4]xf32> to vector<8x2x[4]xf32> // CHECK: %[[TRANSPOSE:.*]] = vector.transpose %[[BCAST]], [0, 2, 1] : vector<8x2x[4]xf32> to vector<8x[4]x2xf32> // CHECK: return %[[TRANSPOSE]] : vector<8x[4]x2xf32> -func.func @xfer_read_minor_identity_tranposed_with_mask_scalable( +func.func @xfer_read_minor_identity_transposed_with_mask_scalable( %mem: memref, - %dim_1: index, - %dim_2: index, + %mask: vector<2x[4]xi1>, %idx: index) -> (vector<8x[4]x2xf32>) { %pad = arith.constant 0.000000e+00 : f32 - %mask = vector.create_mask %dim_2, %dim_1 : vector<2x[4]xi1> %res = vector.transfer_read %mem[%idx, %idx], %pad, %mask { in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1) -> (0, d1, d0)> @@ -319,24 +318,26 @@ func.func @xfer_read_minor_identity_tranposed_with_mask_scalable( // Masked version is not supported // CHECK-LABEL: func @xfer_read_minor_identity_transposed_masked( -// CHECK-SAME: %[[DEST:.*]]: tensor, -// CHECK-SAME: %[[MASK:.*]]: vector<4x1xi1> +// CHECK-SAME: %[[DEST:.*]]: tensor, +// CHECK-SAME: %[[MASK:.*]]: vector<2x4xi1> +// CHECK-SAME: %[[IDX:.*]]: index // CHECK-NOT: vector.transpose -// CHECK: vector.mask %[[MASK]] { vector.transfer_read %[[DEST]]{{.*}}: tensor, vector<1x4x4xf32> } : vector<4x1xi1> -> vector<1x4x4xf32> +// CHECK: vector.mask %[[MASK]] { vector.transfer_read %[[DEST]]{{.*}}: tensor, vector<8x4x2xf32> } : vector<2x4xi1> -> vector<8x4x2xf32> func.func @xfer_read_minor_identity_transposed_masked( - %dest: tensor, - %mask : vector<4x1xi1>, - %idx: index) { + %dest: tensor, + %mask: vector<2x4xi1>, + %idx: index) -> (vector<8x4x2xf32>) { %pad = arith.constant 0.000000e+00 : f32 - %3 = vector.mask %mask { + + %res = vector.mask %mask { vector.transfer_read %dest[%idx, %idx], %pad { - permutation_map = affine_map<(d0, d1) -> (d1, 0, d0)> - } : tensor, vector<1x4x4xf32> - } : vector<4x1xi1> -> vector<1x4x4xf32> + in_bounds = [true, true, true], + permutation_map = affine_map<(d0, d1) -> (0, d1, d0)> + } : tensor, vector<8x4x2xf32> + } : vector<2x4xi1> -> vector<8x4x2xf32> - 
"test.some_use"(%3) : (vector<1x4x4xf32>) -> () - return + return %res : vector<8x4x2xf32> } // CHECK-LABEL: func.func @xfer_read_minor_identity_transposed_masked_scalable( @@ -346,7 +347,7 @@ func.func @xfer_read_minor_identity_transposed_masked( // CHECK: %[[T_READ:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[DEST]]{{.*}} : tensor, vector<8x[4]x2xf32> } : vector<2x[4]xi1> -> vector<8x[4]x2xf32> func.func @xfer_read_minor_identity_transposed_masked_scalable( %dest: tensor, - %mask : vector<2x[4]xi1>, + %mask: vector<2x[4]xi1>, %idx: index) -> vector<8x[4]x2xf32> { %pad = arith.constant 0.000000e+00 : f32 @@ -388,17 +389,16 @@ func.func @xfer_read_minor_identitiy_bcast_dims_scalable( // CHECK-LABEL: func.func @xfer_read_minor_identitiy_bcast_dims_masked // CHECK-SAME: %[[MEM:.*]]: memref, -// CHECK-SAME: %[[DIM:.*]]: index, +// CHECK-SAME: %[[MASK:.*]]: vector<[4]x3xi1> // CHECK-SAME: %[[IDX:.*]]: index) -> vector<8x[4]x2x3xf32> { // CHECK-NOT: vector.broadcast -// CHECK: %[[MASK:.*]] = vector.mask %0 { vector.transfer_read %[[MEM]]{{.*}} : memref, vector<8x[4]x2x3xf32> } : vector<[4]x3xi1> -> vector<8x[4]x2x3xf32> +// CHECK: vector.mask %[[MASK]] { vector.transfer_read %[[MEM]]{{.*}} : memref, vector<8x[4]x2x3xf32> } : vector<[4]x3xi1> -> vector<8x[4]x2x3xf32> func.func @xfer_read_minor_identitiy_bcast_dims_masked( %mem: memref, - %dim: index, + %mask: vector<[4]x3xi1>, %idx: index) -> vector<8x[4]x2x3xf32> { %pad = arith.constant 0.000000e+00 : f32 - %mask = vector.create_mask %dim, %dim: vector<[4]x3xi1> %res = vector.mask %mask { vector.transfer_read %mem[%idx, %idx, %idx, %idx], %pad { diff --git a/mlir/test/Dialect/X86Vector/cvt-packed-f32-to-bf16.mlir b/mlir/test/Dialect/X86Vector/cvt-packed-f32-to-bf16.mlir new file mode 100644 index 0000000000000..c97c52f01c3b0 --- /dev/null +++ b/mlir/test/Dialect/X86Vector/cvt-packed-f32-to-bf16.mlir @@ -0,0 +1,24 @@ +// REQUIRES: target=x86{{.*}} + +// RUN: mlir-opt %s \ +// RUN: -convert-vector-to-llvm="enable-x86vector" -convert-to-llvm \ +// RUN: -reconcile-unrealized-casts | \ +// RUN: mlir-translate --mlir-to-llvmir | \ +// RUN: llc -mcpu=sapphirerapids | \ +// RUN: FileCheck %s + +func.func @avx512bf16_cvt_packed_f32_to_bf16_256( + %a: vector<8xf32>) -> vector<8xbf16> { + %0 = x86vector.avx512.cvt.packed.f32_to_bf16 %a : vector<8xf32> -> vector<8xbf16> + return %0 : vector<8xbf16> +} +// CHECK-LABEL: avx512bf16_cvt_packed_f32_to_bf16_256: +// CHECK: vcvtneps2bf16{{.*}}%xmm + +func.func @avx512bf16_cvt_packed_f32_to_bf16_512( + %a: vector<16xf32>) -> vector<16xbf16> { + %0 = x86vector.avx512.cvt.packed.f32_to_bf16 %a : vector<16xf32> -> vector<16xbf16> + return %0 : vector<16xbf16> +} +// CHECK-LABEL: avx512bf16_cvt_packed_f32_to_bf16_512: +// CHECK: vcvtneps2bf16{{.*}}%ymm diff --git a/mlir/test/Dialect/X86Vector/legalize-for-llvm.mlir b/mlir/test/Dialect/X86Vector/legalize-for-llvm.mlir index ed9177eaec9ce..59be7dd75b3b0 100644 --- a/mlir/test/Dialect/X86Vector/legalize-for-llvm.mlir +++ b/mlir/test/Dialect/X86Vector/legalize-for-llvm.mlir @@ -70,6 +70,24 @@ func.func @avx512bf16_dot_512(%src: vector<16xf32>, %a: vector<32xbf16>, return %0 : vector<16xf32> } +// CHECK-LABEL: func @avx512bf16_cvt_packed_f32_to_bf16_256 +func.func @avx512bf16_cvt_packed_f32_to_bf16_256( + %a: vector<8xf32>) -> (vector<8xbf16>) +{ + // CHECK: x86vector.avx512.intr.cvtneps2bf16.256 + %0 = x86vector.avx512.cvt.packed.f32_to_bf16 %a : vector<8xf32> -> vector<8xbf16> + return %0 : vector<8xbf16> +} + +// CHECK-LABEL: func 
@avx512bf16_cvt_packed_f32_to_bf16_512 +func.func @avx512bf16_cvt_packed_f32_to_bf16_512( + %a: vector<16xf32>) -> (vector<16xbf16>) +{ + // CHECK: x86vector.avx512.intr.cvtneps2bf16.512 + %0 = x86vector.avx512.cvt.packed.f32_to_bf16 %a : vector<16xf32> -> vector<16xbf16> + return %0 : vector<16xbf16> +} + // CHECK-LABEL: func @avx_rsqrt func.func @avx_rsqrt(%a: vector<8xf32>) -> (vector<8xf32>) { diff --git a/mlir/test/Dialect/X86Vector/roundtrip.mlir b/mlir/test/Dialect/X86Vector/roundtrip.mlir index cf74a7ee60255..0d00448c63da8 100644 --- a/mlir/test/Dialect/X86Vector/roundtrip.mlir +++ b/mlir/test/Dialect/X86Vector/roundtrip.mlir @@ -74,6 +74,26 @@ func.func @avx512bf16_dot_512(%src: vector<16xf32>, %a: vector<32xbf16>, return %0 : vector<16xf32> } +// CHECK-LABEL: func @avx512bf16_cvt_packed_f32_to_bf16_256 +func.func @avx512bf16_cvt_packed_f32_to_bf16_256( + %a: vector<8xf32>) -> (vector<8xbf16>) +{ + // CHECK: x86vector.avx512.cvt.packed.f32_to_bf16 {{.*}} : + // CHECK-SAME: vector<8xf32> -> vector<8xbf16> + %0 = x86vector.avx512.cvt.packed.f32_to_bf16 %a : vector<8xf32> -> vector<8xbf16> + return %0 : vector<8xbf16> +} + +// CHECK-LABEL: func @avx512bf16_cvt_packed_f32_to_bf16_512 +func.func @avx512bf16_cvt_packed_f32_to_bf16_512( + %a: vector<16xf32>) -> (vector<16xbf16>) +{ + // CHECK: x86vector.avx512.cvt.packed.f32_to_bf16 {{.*}} : + // CHECK-SAME: vector<16xf32> -> vector<16xbf16> + %0 = x86vector.avx512.cvt.packed.f32_to_bf16 %a : vector<16xf32> -> vector<16xbf16> + return %0 : vector<16xbf16> +} + // CHECK-LABEL: func @avx_rsqrt func.func @avx_rsqrt(%a: vector<8xf32>) -> (vector<8xf32>) { diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/pack-scalable-inner-tile.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/pack-scalable-inner-tile.mlir index a0fd3f7d87083..bca94d4a64416 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/pack-scalable-inner-tile.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/pack-scalable-inner-tile.mlir @@ -22,7 +22,7 @@ // RUN: rm -f %t && %{compile} && %{run} | FileCheck %s -/// End-to-end test for tensor.pack where one of the inner tile sizes is +/// End-to-end test for linalg.pack where one of the inner tile sizes is /// scalable. func.func @main() { @@ -60,7 +60,7 @@ func.func private @pack(%A: tensor<7x16xi32>) { %A_pack_empty = tensor.empty(%c1, %tile_size) : tensor - %A_pack = tensor.pack %A + %A_pack = linalg.pack %A padding_value(%pad_val : i32) inner_dims_pos = [0, 1] inner_tiles = [%tile_size, 1] @@ -117,9 +117,9 @@ func.func private @pack(%A: tensor<7x16xi32>) { module @transforms attributes { transform.with_named_sequence } { transform.named_sequence @__transform_main(%module: !transform.any_op {transform.consume}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module : (!transform.any_op) -> !transform.any_op + %pack = transform.structured.match ops{["linalg.pack"]} in %module : (!transform.any_op) -> !transform.any_op - // 1. Tile so that we can decompose tensor.pack into tensor.pad and other + // 1. 
Tile so that we can decompose linalg.pack into tensor.pad and other // Ops (see step 2) %tiled_pack_op_p, %loops:2 = transform.structured.tile_using_for %pack tile_sizes [1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/pack-dynamic-inner-tile.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/pack-dynamic-inner-tile.mlir index 15edae8b6d3f8..a8daa0b855d00 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/pack-dynamic-inner-tile.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/pack-dynamic-inner-tile.mlir @@ -8,7 +8,7 @@ // RUN: rm -f %t && %{compile} && %{run} | FileCheck %s -/// End-to-end test for tensor.pack where one of the inner tile sizes is +/// End-to-end test for linalg.pack where one of the inner tile sizes is /// dynamic. func.func @main() { @@ -38,7 +38,7 @@ func.func private @pack(%A: tensor<7x16xi32>) { %tile_size = arith.constant 8 : index %A_pack_empty = tensor.empty(%c1, %tile_size) : tensor - %A_pack = tensor.pack %A + %A_pack = linalg.pack %A padding_value(%pad_val : i32) inner_dims_pos = [0, 1] inner_tiles = [%tile_size, 1] @@ -78,9 +78,9 @@ func.func private @pack(%A: tensor<7x16xi32>) { module @transforms attributes { transform.with_named_sequence } { transform.named_sequence @__transform_main(%module: !transform.any_op {transform.consume}) { - %pack = transform.structured.match ops{["tensor.pack"]} in %module : (!transform.any_op) -> !transform.any_op + %pack = transform.structured.match ops{["linalg.pack"]} in %module : (!transform.any_op) -> !transform.any_op - // 1. Tile so that we can decompose tensor.pack into tensor.pad and other + // 1. Tile so that we can decompose linalg.pack into tensor.pad and other // Ops (see step 2) %tiled_pack_op_p, %loops:2 = transform.structured.tile_using_for %pack tile_sizes [1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir index 63622d761bc5b..05e678227de32 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/pack-unpack-mmt4d.mlir @@ -12,9 +12,9 @@ /// End-to-end test for computing matrix-multiplication using linalg.mmt4d. 
In /// particular, demonstrates how the following MLIR sequence (implemented in @mmt4d): /// -/// A_pack = tensor.pack A -/// B_pack = tensor.pack B -/// C_pack = tensor.pack C +/// A_pack = linalg.pack A +/// B_pack = linalg.pack B +/// C_pack = linalg.pack C /// out_pack = linalg.mmt4d(A_pack, B_pack, C_pack) /// /// is equivalent to: @@ -86,16 +86,16 @@ func.func private @mmt4d(%A: tensor<7x16xi32>, %B: tensor<16x13xi32>, %C: tensor %C_pack_empty = tensor.empty() : tensor<2x2x8x8xi32> // Pack matrices - %A_pack = tensor.pack %A padding_value(%zero : i32) inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %A_pack_empty : tensor<7x16xi32> -> tensor<2x16x8x1xi32> - %B_pack = tensor.pack %B padding_value(%zero : i32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [8, 1] into %B_pack_empty : tensor<16x13xi32> -> tensor<2x16x8x1xi32> - %C_pack = tensor.pack %C padding_value(%zero : i32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %C_pack_empty : tensor<7x13xi32> -> tensor<2x2x8x8xi32> + %A_pack = linalg.pack %A padding_value(%zero : i32) inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %A_pack_empty : tensor<7x16xi32> -> tensor<2x16x8x1xi32> + %B_pack = linalg.pack %B padding_value(%zero : i32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [8, 1] into %B_pack_empty : tensor<16x13xi32> -> tensor<2x16x8x1xi32> + %C_pack = linalg.pack %C padding_value(%zero : i32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %C_pack_empty : tensor<7x13xi32> -> tensor<2x2x8x8xi32> // MMT4D %mmt4d = linalg.mmt4d ins(%A_pack, %B_pack : tensor<2x16x8x1xi32>, tensor<2x16x8x1xi32>) outs(%C_pack : tensor<2x2x8x8xi32>) -> tensor<2x2x8x8xi32> // Unpack output %C_out_empty = tensor.empty() : tensor<7x13xi32> - %C_out_unpack = tensor.unpack %mmt4d outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %C_out_empty : tensor<2x2x8x8xi32> -> tensor<7x13xi32> + %C_out_unpack = linalg.unpack %mmt4d outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %C_out_empty : tensor<2x2x8x8xi32> -> tensor<7x13xi32> return %C_out_unpack : tensor<7x13xi32> } @@ -146,16 +146,16 @@ module @transforms attributes { transform.with_named_sequence } { transform.apply_patterns.canonicalization } : !transform.op<"func.func"> - // Step 4. Lower tensor.pack - %pack = transform.structured.match ops{["tensor.pack"]} in %func_h - : (!transform.op<"func.func">) -> !transform.op<"tensor.pack"> - transform.structured.lower_pack %pack : (!transform.op<"tensor.pack">) + // Step 4. Lower linalg.pack + %pack = transform.structured.match ops{["linalg.pack"]} in %func_h + : (!transform.op<"func.func">) -> !transform.op<"linalg.pack"> + transform.structured.lower_pack %pack : (!transform.op<"linalg.pack">) -> (!transform.op<"tensor.pad">, !transform.op<"tensor.expand_shape">, !transform.op<"linalg.transpose">) - // Step 5. Lower tensor.unpack - %unpack = transform.structured.match ops{["tensor.unpack"]} in %func_h - : (!transform.op<"func.func">) -> !transform.op<"tensor.unpack"> - transform.structured.lower_unpack %unpack : (!transform.op<"tensor.unpack">) + // Step 5. 
Lower linalg.unpack + %unpack = transform.structured.match ops{["linalg.unpack"]} in %func_h + : (!transform.op<"func.func">) -> !transform.op<"linalg.unpack"> + transform.structured.lower_unpack %unpack : (!transform.op<"linalg.unpack">) -> (!transform.op<"tensor.empty">, !transform.op<"linalg.transpose">, !transform.op<"tensor.collapse_shape">, diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/unpack-dynamic-inner-tile.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/unpack-dynamic-inner-tile.mlir index 4395dfe74914e..c5360ee1ec954 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/unpack-dynamic-inner-tile.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/unpack-dynamic-inner-tile.mlir @@ -8,7 +8,7 @@ // RUN: rm -f %t && %{compile} && %{run} | FileCheck %s -/// End-to-end test for tensor.unpack where one of the inner tile sizes is +/// End-to-end test for linalg.unpack where one of the inner tile sizes is /// dynamic. func.func @main() { @@ -56,7 +56,7 @@ func.func private @unpack(%A: tensor) { %tile_size = arith.constant 8 : index %A_unpack_empty = tensor.empty() : tensor<7x3xi32> - %A_unpack = tensor.unpack %A + %A_unpack = linalg.unpack %A inner_dims_pos = [0, 1] inner_tiles = [%tile_size, 1] into %A_unpack_empty : tensor -> tensor<7x3xi32> @@ -78,9 +78,9 @@ func.func private @unpack(%A: tensor) { module @transforms attributes { transform.with_named_sequence } { transform.named_sequence @__transform_main(%module: !transform.any_op {transform.consume}) { - %pack = transform.structured.match ops{["tensor.unpack"]} in %module : (!transform.any_op) -> !transform.any_op + %pack = transform.structured.match ops{["linalg.unpack"]} in %module : (!transform.any_op) -> !transform.any_op - // 1. Tile so that we can decompose tensor.pack + // 1. Tile so that we can decompose linalg.pack // Ops (see step 2) %c8 = transform.param.constant 8 : i64 -> !transform.param %tiled_pack_op_p, %loops:2 = transform.structured.tile_using_for %pack tile_sizes [%c8, 1] diff --git a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/dot.mlir b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/dot.mlir index 4b901289d1a4b..53a7282e1f141 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/dot.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/dot.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm="enable-x86vector" -convert-func-to-llvm -reconcile-unrealized-casts | \ +// RUN: mlir-opt %s -convert-vector-to-llvm="enable-x86vector" -test-lower-to-llvm | \ // RUN: mlir-translate --mlir-to-llvmir | \ // RUN: %lli --entry-function=entry --mattr="avx" --dlopen=%mlir_c_runner_utils | \ // RUN: FileCheck %s diff --git a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/inline-asm-vector-avx512.mlir b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/inline-asm-vector-avx512.mlir index 828e498543a9f..8376464cee42d 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/inline-asm-vector-avx512.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/inline-asm-vector-avx512.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-linalg-to-loops -convert-vector-to-scf='full-unroll=true' -lower-affine -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm='use-bare-ptr-memref-call-conv=1' -convert-arith-to-llvm -reconcile-unrealized-casts |\ +// RUN: mlir-opt %s -convert-linalg-to-loops -convert-vector-to-scf='full-unroll=true' -test-lower-to-llvm |\ // RUN: mlir-translate --mlir-to-llvmir |\ 
// RUN: %lli --entry-function=entry --mattr="avx512f" --dlopen=%mlir_c_runner_utils |\ // RUN: FileCheck %s @@ -37,4 +37,3 @@ module { llvm.return %i0 : i32 } } - diff --git a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/mask-compress.mlir b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/mask-compress.mlir index f1d7caeb4f3da..eda9138d222a0 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/mask-compress.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/mask-compress.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm="enable-x86vector" -convert-func-to-llvm -reconcile-unrealized-casts | \ +// RUN: mlir-opt %s -convert-vector-to-llvm="enable-x86vector" -test-lower-to-llvm | \ // RUN: mlir-translate --mlir-to-llvmir | \ // RUN: %lli --entry-function=entry --mattr="avx512bw" --dlopen=%mlir_c_runner_utils | \ // RUN: FileCheck %s diff --git a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/rsqrt.mlir b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/rsqrt.mlir index 225f9963aeeea..6cc4e6ca69fe3 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/rsqrt.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/rsqrt.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm="enable-x86vector" -convert-func-to-llvm -reconcile-unrealized-casts | \ +// RUN: mlir-opt %s -convert-vector-to-llvm="enable-x86vector" -test-lower-to-llvm | \ // RUN: mlir-translate --mlir-to-llvmir | \ // RUN: %lli --entry-function=entry --mattr="avx" --dlopen=%mlir_c_runner_utils | \ // RUN: FileCheck %s diff --git a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/sparse-dot-product.mlir b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/sparse-dot-product.mlir index f665891536ada..bf1caaafa3ff4 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/sparse-dot-product.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/sparse-dot-product.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm="enable-x86vector" -finalize-memref-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ +// RUN: mlir-opt %s -convert-vector-to-llvm="enable-x86vector" -test-lower-to-llvm | \ // RUN: mlir-translate --mlir-to-llvmir | \ // RUN: %lli --entry-function=entry --mattr="avx512bw,avx512vp2intersect" --dlopen=%mlir_c_runner_utils | \ // RUN: FileCheck %s diff --git a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/vp2intersect-i32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/vp2intersect-i32.mlir index 2eccf00f221a7..46124c2ba87c4 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/vp2intersect-i32.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/X86Vector/vp2intersect-i32.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm="enable-x86vector" -convert-func-to-llvm -reconcile-unrealized-casts | \ +// RUN: mlir-opt %s -convert-vector-to-llvm="enable-x86vector" -test-lower-to-llvm | \ // RUN: mlir-translate --mlir-to-llvmir | \ // RUN: %lli --entry-function=entry --mattr="avx512bw,avx512vp2intersect" --dlopen=%mlir_c_runner_utils | \ // RUN: FileCheck %s diff --git a/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir b/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir new file mode 100644 index 0000000000000..34dde6e03c80e --- /dev/null +++ b/mlir/test/Integration/GPU/CUDA/command-line-arg.mlir @@ -0,0 +1,21 @@ +// RUN: mlir-opt %s 
\ +// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 ptxas-cmd-options='-v --register-usage-level=8'" -debug-only=serialize-to-binary \ +// RUN: 2>&1 | FileCheck %s + +func.func @host_function(%arg0 : f32, %arg1 : memref) { + %cst = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst2 = memref.dim %arg1, %c0 : memref + + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst, %grid_y = %cst, %grid_z = %cst) + threads(%tx, %ty, %tz) in (%block_x = %cst2, %block_y = %cst, %block_z = %cst) { + memref.store %arg0, %arg1[%tx] : memref + gpu.terminator + } + + return +} + +// CHECK: ptxas -arch sm_80 +// CHECK-SAME: -v +// CHECK-SAME: --register-usage-level=8 diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir index 2d35be403ef99..8ce05d94c4ad0 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir @@ -211,7 +211,7 @@ module { linalg.yield %7, %8 : f32, f32 } -> (tensor<64x64xf32>, tensor<64x64xf32>) %5 = tensor.empty() : tensor<2048xf32> - %unpack = tensor.unpack %0#0 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %5 : tensor<64x32xf32> -> tensor<2048xf32> + %unpack = linalg.unpack %0#0 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %5 : tensor<64x32xf32> -> tensor<2048xf32> return %4#1, %unpack : tensor<64x64xf32>, tensor<2048xf32> } } @@ -254,7 +254,7 @@ module attributes {transform.with_named_sequence} { // CHECK: tensor.parallel_insert_slice %[[ELEM_OUT]]#1 into %[[ELEM_OUT_ARG_1]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: } // CHECK: } -// CHECK: %[[UNPACK:.*]] = tensor.unpack %[[FINAL_RESULT]]#0 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %{{.*}} : tensor<64x32xf32> -> tensor<2048xf32> +// CHECK: %[[UNPACK:.*]] = linalg.unpack %[[FINAL_RESULT]]#0 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %{{.*}} : tensor<64x32xf32> -> tensor<2048xf32> // CHECK: return %[[FINAL_RESULT]]#3, %[[UNPACK]] : // ----- @@ -278,7 +278,7 @@ module { } } %output = tensor.empty() : tensor<2048xf32> - %unpack = tensor.unpack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %output : tensor<64x32xf32> -> tensor<2048xf32> + %unpack = linalg.unpack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %output : tensor<64x32xf32> -> tensor<2048xf32> return %unpack : tensor<2048xf32> } } @@ -308,7 +308,7 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[UNPACK_RESULT_OFFSET:.*]] = affine.apply #[[UNPACK_RESULT_OFFSET_MAP]](%[[IV1]]) // CHECK-DAG: %[[UNPACK_RESULT_SIZE:.*]] = affine.min #[[UNPACK_RESULT_SIZE_MAP]](%[[IV1]]) // CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[UNPACK_OUT_ARG]][%[[UNPACK_RESULT_OFFSET]]] [%[[UNPACK_RESULT_SIZE]]] [1] -// CHECK: %[[TILED_UNPACK_OUT:.*]] = tensor.unpack %[[GENERIC_OUT]] +// CHECK: %[[TILED_UNPACK_OUT:.*]] = linalg.unpack %[[GENERIC_OUT]] // CHECK-SAME: outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] // CHECK-SAME: into %[[TILED_UNPACK_DEST]] // CHECK: scf.forall.in_parallel { @@ -339,7 +339,7 @@ module { } } %output = tensor.empty() : tensor<2047xf32> - %unpack = tensor.unpack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %output : tensor<64x32xf32> -> tensor<2047xf32> + %unpack = linalg.unpack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %output : 
tensor<64x32xf32> -> tensor<2047xf32> return %unpack : tensor<2047xf32> } } @@ -369,7 +369,7 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[UNPACK_RESULT_OFFSET:.*]] = affine.apply #[[UNPACK_RESULT_OFFSET_MAP]](%[[IV1]]) // CHECK-DAG: %[[UNPACK_RESULT_SIZE:.*]] = affine.min #[[UNPACK_RESULT_SIZE_MAP]](%[[IV1]]) // CHECK: %[[TILED_UNPACK_DEST:.*]] = tensor.extract_slice %[[UNPACK_OUT_ARG]][%[[UNPACK_RESULT_OFFSET]]] [%[[UNPACK_RESULT_SIZE]]] [1] -// CHECK: %[[TILED_UNPACK_OUT:.*]] = tensor.unpack %[[GENERIC_OUT]] +// CHECK: %[[TILED_UNPACK_OUT:.*]] = linalg.unpack %[[GENERIC_OUT]] // CHECK-SAME: outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] // CHECK-SAME: into %[[TILED_UNPACK_DEST]] // CHECK: scf.forall.in_parallel { @@ -400,7 +400,7 @@ module { } } %output = tensor.empty() : tensor<4x32x16xf32> - %pack = tensor.pack %1 inner_dims_pos = [0] inner_tiles = [16] into %output : tensor<64x32xf32> -> tensor<4x32x16xf32> + %pack = linalg.pack %1 inner_dims_pos = [0] inner_tiles = [16] into %output : tensor<64x32xf32> -> tensor<4x32x16xf32> return %pack : tensor<4x32x16xf32> } } @@ -428,7 +428,7 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: outs(%[[GENERIC_OUT_SLICE]] : // CHECK: %[[PACK_RESULT_OFFSET:.*]] = affine.apply #[[PACK_RESULT_MAP]](%[[IV1]]) // CHECK: %[[TILED_PACK_DEST:.*]] = tensor.extract_slice %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]], %[[IV2]], 0] [2, 32, 16] [1, 1, 1] -// CHECK: %[[TILED_PACK_OUT:.*]] = tensor.pack %[[GENERIC_OUT]] +// CHECK: %[[TILED_PACK_OUT:.*]] = linalg.pack %[[GENERIC_OUT]] // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [16] // CHECK-SAME: into %[[TILED_PACK_DEST]] // CHECK: scf.forall.in_parallel { diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir index 1cf6664f3df24..8a0390a4379cf 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-using-interface.mlir @@ -591,7 +591,7 @@ module attributes {transform.with_named_sequence} { // ----- func.func @imperfect_unpack_producer_fusion(%source: tensor<1x1x288x8x4xf32>, %dest: tensor<1x2x1152xf32>) -> tensor<1x2x1152xf32> { - %0 = tensor.unpack %source + %0 = linalg.unpack %source outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [8, 4] into %dest @@ -625,7 +625,7 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[ARG1:.+]]: tensor<1x2x1152xf32> // CHECK: %[[FOR_RESULT:.+]] = scf.for{{.*}}iter_args(%[[ITER_ARG:.+]] = {{.*}}) // CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]] -// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[SLICE]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[SLICE]] // CHECK-DAG: %[[UNPACK_SLICE:.+]] = tensor.extract_slice %[[UNPACK]] // CHECK-DAG: %[[INIT_SLICE:.+]] = tensor.extract_slice %[[ITER_ARG]] // CHECK: %[[GENERIC:.+]] = linalg.generic diff --git a/mlir/test/Target/LLVMIR/Import/alias.ll b/mlir/test/Target/LLVMIR/Import/alias.ll index 9f86da3ecc71c..3ab68a7d8fb81 100644 --- a/mlir/test/Target/LLVMIR/Import/alias.ll +++ b/mlir/test/Target/LLVMIR/Import/alias.ll @@ -12,7 +12,7 @@ entry: ret ptr null } -; ----- +; // ----- @zed = global i32 42 @foo = alias i32, ptr @zed @@ -27,7 +27,7 @@ entry: ; CHECK: llvm.return %[[ADDR]] : !llvm.ptr ; CHECK: } -; ----- +; // ----- @v1 = global i32 0 @a3 = alias i32, addrspacecast (ptr @v1 to ptr addrspace(2)) @@ -37,7 +37,7 @@ entry: ; CHECK: llvm.return 
%[[CASTED_ADDR]] : !llvm.ptr<2>
 ; CHECK: }
 
-; -----
+; // -----
 
 @some_name = constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr null] }
 @vtable = alias { [3 x ptr] }, ptr @some_name
@@ -47,7 +47,7 @@ entry:
 ; CHECK: llvm.return %[[ADDR]] : !llvm.ptr
 ; CHECK: }
 
-; -----
+; // -----
 
 @glob.private = private constant [32 x i32] zeroinitializer
 @glob = linkonce_odr hidden alias [32 x i32], inttoptr (i64 add (i64 ptrtoint (ptr @glob.private to i64), i64 1234) to ptr)
@@ -60,7 +60,7 @@ entry:
 ; CHECK: %[[RET:.*]] = llvm.inttoptr %[[INTTOPTR]] : i64 to !llvm.ptr
 ; CHECK: llvm.return %[[RET]] : !llvm.ptr
 
-; -----
+; // -----
 
 @g1 = private global i32 0
 @g2 = internal constant ptr @a1
diff --git a/mlir/test/Target/LLVMIR/omptarget-depend-host-only.mlir b/mlir/test/Target/LLVMIR/omptarget-depend-host-only.mlir
index 621a206e18053..ece32bb5419c6 100644
--- a/mlir/test/Target/LLVMIR/omptarget-depend-host-only.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-depend-host-only.mlir
@@ -25,8 +25,29 @@ module attributes {omp.is_target_device = false} {
 // CHECK: define void @omp_target_depend_()
 // CHECK-NOT: define {{.*}} @
 // CHECK-NOT: call i32 @__tgt_target_kernel({{.*}})
+// CHECK: call void @__kmpc_omp_task_begin_if0
+// CHECK-NEXT: call void @.omp_target_task_proxy_func
+// CHECK: call void @__kmpc_omp_task_complete_if0
+// https://github.com/llvm/llvm-project/issues/126949 exposes two issues:
+// 1. Empty target task proxy functions.
+// 2. Once 1 is fixed, the omp target kernel is called twice: once via the
+// target task proxy function and a second time after the target task is done.
+// The following checks verify the fix for problem #2; the checks at the end of
+// this file verify the fix for problem #1.
+// CHECK-NEXT: br label %[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY:.*]]
+// CHECK:[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY]]:
+// CHECK-NEXT: ret void
+
+// CHECK: define internal void @omp_target_depend_..omp_par
 // CHECK: call void @__omp_offloading_[[DEV:.*]]_[[FIL:.*]]_omp_target_depend__l[[LINE:.*]](ptr {{.*}})
+// CHECK-NEXT: br label %[[BLOCK_AFTER_TARGET_TASK_BODY:.*]]
+// CHECK: [[BLOCK_AFTER_TARGET_TASK_BODY]]:
 // CHECK-NEXT: ret void
+
 // CHECK: define internal void @__omp_offloading_[[DEV]]_[[FIL]]_omp_target_depend__l[[LINE]](ptr %[[ADDR_A:.*]])
 // CHECK: store i32 100, ptr %[[ADDR_A]], align 4
+
+// The following checks test the fix for problem #1 as described in https://github.com/llvm/llvm-project/issues/126949
+// CHECK: define internal void @.omp_target_task_proxy_func
+// CHECK: call void @omp_target_depend_..omp_par
diff --git a/mlir/test/Target/LLVMIR/omptarget-nowait-host-only.mlir b/mlir/test/Target/LLVMIR/omptarget-nowait-host-only.mlir
index 6b634226a3568..94d8d052d087e 100644
--- a/mlir/test/Target/LLVMIR/omptarget-nowait-host-only.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-nowait-host-only.mlir
@@ -20,10 +20,30 @@ module attributes {omp.is_target_device = false} {
 // CHECK: define void @omp_target_nowait_()
 // CHECK-NOT: define {{.*}} @
 // CHECK-NOT: call ptr @__kmpc_omp_target_task_alloc({{.*}})
+// CHECK: call void @__kmpc_omp_task_begin_if0
+// CHECK-NEXT: call void @.omp_target_task_proxy_func
+// CHECK: call void @__kmpc_omp_task_complete_if0
+// https://github.com/llvm/llvm-project/issues/126949 exposes two issues:
+// 1. Empty target task proxy functions.
+// 2. Once 1 is fixed, the omp target kernel is called twice: once via the
+// target task proxy function and a second time after the target task is done.
+// The following checks verify the fix for problem #2; the checks at the end of
+// this file verify the fix for problem #1.
+// CHECK-NEXT: br label %[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY:.*]]
+// CHECK:[[BLOCK_AFTER_OUTLINED_TARGET_TASK_BODY]]:
+// CHECK-NEXT: ret void
+
 // Verify that we directly emit a call to the "target" region's body from the
 // parent function of the the `omp.target` op.
+// CHECK: define internal void @omp_target_nowait_..omp_par
 // CHECK: call void @__omp_offloading_[[DEV:.*]]_[[FIL:.*]]_omp_target_nowait__l[[LINE:.*]](ptr {{.*}})
+// CHECK-NEXT: br label %[[BLOCK_AFTER_TARGET_TASK_BODY:.*]]
+// CHECK: [[BLOCK_AFTER_TARGET_TASK_BODY]]:
 // CHECK-NEXT: ret void
 
 // CHECK: define internal void @__omp_offloading_[[DEV]]_[[FIL]]_omp_target_nowait__l[[LINE]](ptr %[[ADDR_X:.*]])
 // CHECK: store float 5{{.*}}, ptr %[[ADDR_X]], align 4
+
+// The following checks test the fix for problem #1 as described in https://github.com/llvm/llvm-project/issues/126949
+// CHECK: define internal void @.omp_target_task_proxy_func
+// CHECK: call void @omp_target_nowait_..omp_par
diff --git a/mlir/test/Target/LLVMIR/x86vector.mlir b/mlir/test/Target/LLVMIR/x86vector.mlir
index 1df03f10c9321..db1c10cd5cd37 100644
--- a/mlir/test/Target/LLVMIR/x86vector.mlir
+++ b/mlir/test/Target/LLVMIR/x86vector.mlir
@@ -62,37 +62,57 @@ llvm.func @LLVM_x86_vp2intersect_q_512(%a: vector<8xi64>, %b: vector<8xi64>)
 
 // CHECK-LABEL: define <4 x float> @LLVM_x86_avx512bf16_dpbf16ps_128
 llvm.func @LLVM_x86_avx512bf16_dpbf16ps_128(
-    %arg0: vector<4xf32>, %arg1: vector<8xbf16>, %arg2: vector<8xbf16>
+    %src: vector<4xf32>, %a: vector<8xbf16>, %b: vector<8xbf16>
   ) -> vector<4xf32>
 {
   // CHECK: call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(
-  %0 = "x86vector.avx512.intr.dpbf16ps.128"(%arg0, %arg1, %arg2)
+  %0 = "x86vector.avx512.intr.dpbf16ps.128"(%src, %a, %b)
     : (vector<4xf32>, vector<8xbf16>, vector<8xbf16>) -> vector<4xf32>
   llvm.return %0 : vector<4xf32>
 }
 
 // CHECK-LABEL: define <8 x float> @LLVM_x86_avx512bf16_dpbf16ps_256
 llvm.func @LLVM_x86_avx512bf16_dpbf16ps_256(
-    %arg0: vector<8xf32>, %arg1: vector<16xbf16>, %arg2: vector<16xbf16>
+    %src: vector<8xf32>, %a: vector<16xbf16>, %b: vector<16xbf16>
   ) -> vector<8xf32>
 {
   // CHECK: call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(
-  %0 = "x86vector.avx512.intr.dpbf16ps.256"(%arg0, %arg1, %arg2)
+  %0 = "x86vector.avx512.intr.dpbf16ps.256"(%src, %a, %b)
     : (vector<8xf32>, vector<16xbf16>, vector<16xbf16>) -> vector<8xf32>
   llvm.return %0 : vector<8xf32>
 }
 
 // CHECK-LABEL: define <16 x float> @LLVM_x86_avx512bf16_dpbf16ps_512
 llvm.func @LLVM_x86_avx512bf16_dpbf16ps_512(
-    %arg0: vector<16xf32>, %arg1: vector<32xbf16>, %arg2: vector<32xbf16>
+    %src: vector<16xf32>, %a: vector<32xbf16>, %b: vector<32xbf16>
  ) -> vector<16xf32>
 {
   // CHECK: call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(
-  %0 = "x86vector.avx512.intr.dpbf16ps.512"(%arg0, %arg1, %arg2)
+  %0 = "x86vector.avx512.intr.dpbf16ps.512"(%src, %a, %b)
     : (vector<16xf32>, vector<32xbf16>, vector<32xbf16>) -> vector<16xf32>
   llvm.return %0 : vector<16xf32>
 }
 
+// CHECK-LABEL: define <8 x bfloat> @LLVM_x86_avx512bf16_cvtneps2bf16_256
+llvm.func @LLVM_x86_avx512bf16_cvtneps2bf16_256(
+    %a: vector<8xf32>) -> vector<8xbf16>
+{
+  // CHECK: call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(
+  %0 = "x86vector.avx512.intr.cvtneps2bf16.256"(%a)
+    : (vector<8xf32>) -> vector<8xbf16>
+  llvm.return %0 : vector<8xbf16>
+}
+
+// CHECK-LABEL: define <16 x bfloat> @LLVM_x86_avx512bf16_cvtneps2bf16_512
+llvm.func
@LLVM_x86_avx512bf16_cvtneps2bf16_512( + %a: vector<16xf32>) -> vector<16xbf16> +{ + // CHECK: call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512( + %0 = "x86vector.avx512.intr.cvtneps2bf16.512"(%a) + : (vector<16xf32>) -> vector<16xbf16> + llvm.return %0 : vector<16xbf16> +} + // CHECK-LABEL: define <8 x float> @LLVM_x86_avx_rsqrt_ps_256 llvm.func @LLVM_x86_avx_rsqrt_ps_256(%a: vector <8xf32>) -> vector<8xf32> { @@ -103,11 +123,11 @@ llvm.func @LLVM_x86_avx_rsqrt_ps_256(%a: vector <8xf32>) -> vector<8xf32> // CHECK-LABEL: define <8 x float> @LLVM_x86_avx_dp_ps_256 llvm.func @LLVM_x86_avx_dp_ps_256( - %arg0: vector<8xf32>, %arg1: vector<8xf32> + %a: vector<8xf32>, %b: vector<8xf32> ) -> vector<8xf32> { // CHECK: call <8 x float> @llvm.x86.avx.dp.ps.256( - %0 = llvm.mlir.constant(-1 : i8) : i8 - %1 = "x86vector.avx.intr.dp.ps.256"(%arg0, %arg1, %0) : (vector<8xf32>, vector<8xf32>, i8) -> vector<8xf32> + %c = llvm.mlir.constant(-1 : i8) : i8 + %1 = "x86vector.avx.intr.dp.ps.256"(%a, %b, %c) : (vector<8xf32>, vector<8xf32>, i8) -> vector<8xf32> llvm.return %1 : vector<8xf32> } diff --git a/mlir/test/Target/SPIRV/terminator.mlir b/mlir/test/Target/SPIRV/terminator.mlir index 065b68b9bdfbb..8338a575681f1 100644 --- a/mlir/test/Target/SPIRV/terminator.mlir +++ b/mlir/test/Target/SPIRV/terminator.mlir @@ -24,4 +24,10 @@ spirv.module Logical GLSL450 requires #spirv.vce { // CHECK-NOT: spirv.Unreachable spirv.Unreachable } + + // CHECK-LABEL: @kill + spirv.func @kill() -> () "None" { + // CHECK: spirv.Kill + spirv.Kill + } } diff --git a/mlir/test/Transforms/loop-invariant-code-motion.mlir b/mlir/test/Transforms/loop-invariant-code-motion.mlir index 5133c14414c97..c1604e226a334 100644 --- a/mlir/test/Transforms/loop-invariant-code-motion.mlir +++ b/mlir/test/Transforms/loop-invariant-code-motion.mlir @@ -1163,18 +1163,18 @@ func.func @speculate_ceildivsi_range( func.func @speculate_static_pack_and_unpack(%source: tensor<128x256xf32>, %dest: tensor<4x16x32x16xf32>, %lb: index, %ub: index, %step: index) { - // CHECK: tensor.pack + // CHECK: linalg.pack // CHECK-NEXT: scf.for scf.for %i = %lb to %ub step %step { - %packed = tensor.pack %source + %packed = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<128x256xf32> -> tensor<4x16x32x16xf32> } - // CHECK: tensor.unpack + // CHECK: linalg.unpack // CHECK-NEXT: scf.for scf.for %i = %lb to %ub step %step { - %unpacked = tensor.unpack %dest + %unpacked = linalg.unpack %dest inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %source : tensor<4x16x32x16xf32> -> tensor<128x256xf32> } @@ -1188,25 +1188,25 @@ func.func @speculate_dynamic_pack_and_unpack(%source: tensor, %tile_m: index, %tile_n: index, %pad: f32) { // CHECK: scf.for - // CHECK-NEXT: tensor.pack + // CHECK-NEXT: linalg.pack scf.for %i = %lb to %ub step %step { - %packed = tensor.pack %source + %packed = linalg.pack %source inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %dest : tensor -> tensor } // CHECK: scf.for - // CHECK-NEXT: tensor.unpack + // CHECK-NEXT: linalg.unpack scf.for %i = %lb to %ub step %step { - %unpacked = tensor.unpack %dest + %unpacked = linalg.unpack %dest inner_dims_pos = [0, 1] inner_tiles = [%tile_n, %tile_m] into %source : tensor -> tensor } - // CHECK: tensor.pack + // CHECK: linalg.pack // CHECK-NEXT: scf.for scf.for %i = %lb to %ub step %step { - %packed = tensor.pack %source padding_value(%pad : f32) + %packed = linalg.pack %source padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%tile_n, 
%tile_m] into %dest : tensor -> tensor
   }
 }
diff --git a/mlir/test/Transforms/scf-loop-unroll.mlir b/mlir/test/Transforms/scf-loop-unroll.mlir
index baf6b2970ac0e..0ef6ad15d4eb0 100644
--- a/mlir/test/Transforms/scf-loop-unroll.mlir
+++ b/mlir/test/Transforms/scf-loop-unroll.mlir
@@ -1,5 +1,6 @@
 // RUN: mlir-opt %s --test-loop-unrolling="unroll-factor=3" -split-input-file -canonicalize | FileCheck %s
 // RUN: mlir-opt %s --test-loop-unrolling="unroll-factor=1" -split-input-file -canonicalize | FileCheck %s --check-prefix UNROLL-BY-1
+// RUN: mlir-opt %s --test-loop-unrolling="unroll-full=true" -split-input-file -canonicalize | FileCheck %s --check-prefix UNROLL-FULL
 
 // CHECK-LABEL: scf_loop_unroll_single
 func.func @scf_loop_unroll_single(%arg0 : f32, %arg1 : f32) -> f32 {
@@ -56,3 +57,59 @@ func.func @scf_loop_unroll_factor_1_promote() -> () {
   // UNROLL-BY-1-NEXT: %[[C0:.*]] = arith.constant 0 : index
   // UNROLL-BY-1-NEXT: %{{.*}} = "test.foo"(%[[C0]]) : (index) -> i32
 }
+
+// UNROLL-FULL-LABEL: func @scf_loop_unroll_full_single
+// UNROLL-FULL-SAME: %[[ARG:.*]]: index)
+func.func @scf_loop_unroll_full_single(%arg : index) -> index {
+  %0 = arith.constant 0 : index
+  %1 = arith.constant 1 : index
+  %2 = arith.constant 4 : index
+  %4 = scf.for %iv = %0 to %2 step %1 iter_args(%arg1 = %1) -> index {
+    %3 = arith.addi %arg1, %arg : index
+    scf.yield %3 : index
+  }
+  return %4 : index
+  // UNROLL-FULL: %[[C1:.*]] = arith.constant 1 : index
+  // UNROLL-FULL: %[[V0:.*]] = arith.addi %[[ARG]], %[[C1]] : index
+  // UNROLL-FULL: %[[V1:.*]] = arith.addi %[[V0]], %[[ARG]] : index
+  // UNROLL-FULL: %[[V2:.*]] = arith.addi %[[V1]], %[[ARG]] : index
+  // UNROLL-FULL: %[[V3:.*]] = arith.addi %[[V2]], %[[ARG]] : index
+  // UNROLL-FULL: return %[[V3]] : index
+}
+
+// UNROLL-FULL-LABEL: func @scf_loop_unroll_full_outer_loops
+// UNROLL-FULL-SAME: %[[ARG:.*]]: vector<4x4xindex>)
+func.func @scf_loop_unroll_full_outer_loops(%arg0: vector<4x4xindex>) -> index {
+  %0 = arith.constant 0 : index
+  %1 = arith.constant 1 : index
+  %2 = arith.constant 4 : index
+  %6 = scf.for %arg1 = %0 to %2 step %1 iter_args(%it0 = %0) -> index {
+    %5 = scf.for %arg2 = %0 to %2 step %1 iter_args(%it1 = %it0) -> index {
+      %3 = vector.extract %arg0[%arg1, %arg2] : index from vector<4x4xindex>
+      %4 = arith.addi %3, %it1 : index
+      scf.yield %3 : index
+    }
+    scf.yield %5 : index
+  }
+  return %6 : index
+  // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+  // UNROLL-FULL: %[[C1:.*]] = arith.constant 1 : index
+  // UNROLL-FULL: %[[C4:.*]] = arith.constant 4 : index
+  // UNROLL-FULL: %[[SUM0:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C4]] step %[[C1]] iter_args(%{{.*}} = %[[C0]])
+  // UNROLL-FULL: %[[VAL:.*]] = vector.extract %[[ARG]][0, %[[IV]]] : index from vector<4x4xindex>
+  // UNROLL-FULL: scf.yield %[[VAL]] : index
+  // UNROLL-FULL: }
+  // UNROLL-FULL: %[[SUM1:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C4]] step %[[C1]] iter_args(%{{.*}} = %[[SUM0]])
+  // UNROLL-FULL: %[[VAL:.*]] = vector.extract %[[ARG]][1, %[[IV]]] : index from vector<4x4xindex>
+  // UNROLL-FULL: scf.yield %[[VAL]] : index
+  // UNROLL-FULL: }
+  // UNROLL-FULL: %[[SUM2:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C4]] step %[[C1]] iter_args(%{{.*}} = %[[SUM1]])
+  // UNROLL-FULL: %[[VAL:.*]] = vector.extract %[[ARG]][2, %[[IV]]] : index from vector<4x4xindex>
+  // UNROLL-FULL: scf.yield %[[VAL]] : index
+  // UNROLL-FULL: }
+  // UNROLL-FULL: %[[SUM3:.*]] = scf.for %[[IV:.*]] = %[[C0]] to %[[C4]] step %[[C1]] iter_args(%{{.*}} = %[[SUM2]])
+  // UNROLL-FULL: %[[VAL:.*]] =
vector.extract %[[ARG]][3, %[[IV]]] : index from vector<4x4xindex> + // UNROLL-FULL: scf.yield %[[VAL]] : index + // UNROLL-FULL: } + // UNROLL-FULL: return %[[SUM3]] : index +} diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp index fa2a27dcfa991..046b9a65f3359 100644 --- a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp +++ b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp @@ -74,8 +74,9 @@ struct TestLinalgTransforms *this, "test-decompose-pad-tensor", llvm::cl::desc("Test transform pad tensor by copying with generic ops"), llvm::cl::init(false)}; + // TODO: This is not used - delete. Option testDecomposeTensorPackOp{ - *this, "test-decompose-tensor-pack", + *this, "test-decompose-linalg-pack", llvm::cl::desc("Test transform that generalizes pack ops into a sequence " "of tensor and Linalg ops"), llvm::cl::init(false)}; @@ -130,6 +131,14 @@ struct TestLinalgTransforms Option testDecomposeWinogradOps{ *this, "test-decompose-winograd-ops", llvm::cl::desc("Test decompose Winograd ops"), llvm::cl::init(false)}; + Option testFoldIntoPackAndUnpack{ + *this, "test-fold-into-pack-and-unpack", + llvm::cl::desc("Test folding ops into linalg.pack and linalg.unpack"), + llvm::cl::init(false)}; + Option testSimplifyPackUnpackPatterns{ + *this, "test-simplify-pack-unpack-patterns", + llvm::cl::desc("Test patterns to simplify linalg.pack and linalg.unpack"), + llvm::cl::init(false)}; }; } // namespace @@ -227,6 +236,18 @@ static void applyDecomposeWinogradOps(func::FuncOp funcOp) { (void)applyPatternsGreedily(funcOp, std::move(patterns)); } +static void applyFoldIntoPackAndUnpackPatterns(Operation *rootOp) { + RewritePatternSet patterns(rootOp->getContext()); + linalg::populateFoldIntoPackAndUnpackPatterns(patterns); + (void)applyPatternsGreedily(rootOp, std::move(patterns)); +} + +static void applySimplifyPackUnpackPatterns(Operation *rootOp) { + RewritePatternSet patterns(rootOp->getContext()); + linalg::populateSimplifyPackAndUnpackPatterns(patterns); + (void)applyPatternsGreedily(rootOp, std::move(patterns)); +} + /// Apply transformations specified as patterns. 
 void TestLinalgTransforms::runOnOperation() {
   if (testPatterns)
@@ -255,6 +276,11 @@ void TestLinalgTransforms::runOnOperation() {
     return applyWinogradConv2D(getOperation());
   if (testDecomposeWinogradOps)
     return applyDecomposeWinogradOps(getOperation());
+  Operation *rootOp = getOperation();
+  if (testFoldIntoPackAndUnpack)
+    applyFoldIntoPackAndUnpackPatterns(rootOp);
+  if (testSimplifyPackUnpackPatterns)
+    applySimplifyPackUnpackPatterns(rootOp);
 }
 
 namespace mlir {
diff --git a/mlir/test/lib/Dialect/SCF/TestLoopUnrolling.cpp b/mlir/test/lib/Dialect/SCF/TestLoopUnrolling.cpp
index 8694a7f9bbd62..ced003305a7b8 100644
--- a/mlir/test/lib/Dialect/SCF/TestLoopUnrolling.cpp
+++ b/mlir/test/lib/Dialect/SCF/TestLoopUnrolling.cpp
@@ -42,10 +42,11 @@ struct TestLoopUnrollingPass
   TestLoopUnrollingPass(const TestLoopUnrollingPass &) {}
   explicit TestLoopUnrollingPass(uint64_t unrollFactorParam,
                                  unsigned loopDepthParam,
-                                 bool annotateLoopParam) {
+                                 bool annotateLoopParam, bool unrollFullParam) {
     unrollFactor = unrollFactorParam;
     loopDepth = loopDepthParam;
     annotateLoop = annotateLoopParam;
+    unrollFull = unrollFullParam;
   }
 
   void getDependentDialects(DialectRegistry &registry) const override {
@@ -63,8 +64,12 @@ struct TestLoopUnrollingPass
         op->setAttr("unrolled_iteration", b.getUI32IntegerAttr(i));
       }
     };
-    for (auto loop : loops)
-      (void)loopUnrollByFactor(loop, unrollFactor, annotateFn);
+    for (auto loop : loops) {
+      if (unrollFull)
+        (void)loopUnrollFull(loop);
+      else
+        (void)loopUnrollByFactor(loop, unrollFactor, annotateFn);
+    }
   }
   Option unrollFactor{*this, "unroll-factor",
                       llvm::cl::desc("Loop unroll factor."),
@@ -77,6 +82,9 @@ struct TestLoopUnrollingPass
                       llvm::cl::init(false)};
   Option loopDepth{*this, "loop-depth", llvm::cl::desc("Loop depth."),
                    llvm::cl::init(0)};
+  Option unrollFull{*this, "unroll-full",
+                    llvm::cl::desc("Full unroll loops."),
+                    llvm::cl::init(false)};
 };
 } // namespace
diff --git a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
index 173bfd8955f2b..e435130c2a417 100644
--- a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
@@ -77,11 +77,6 @@ struct TestTensorTransforms
       llvm::cl::desc("Test folding of expand_shape/collapse_shape"),
       llvm::cl::init(false)};
 
-  Option testFoldIntoPackAndUnpack{
-      *this, "test-fold-into-pack-and-unpack",
-      llvm::cl::desc("Test folding ops into tensor.pack and tensor.unpack"),
-      llvm::cl::init(false)};
-
   Option useForeach{
       *this, "use-foreach",
       llvm::cl::desc(
          "the extract_slice of collapse_shape pattern"),
       llvm::cl::init(false)};
-
-  Option testSimplifyPackUnpackPatterns{
-      *this, "test-simplify-pack-unpack-patterns",
-      llvm::cl::desc("Test patterns to simplify tensor.pack and tensor.unpack"),
-      llvm::cl::init(false)};
-
   Option testTrackingListener{
       *this, "test-tracking-listener",
       llvm::cl::desc("Test tensor TrackingListener for the transform dialect"),
@@ -113,12 +103,6 @@ static void applyBubbleUpExpandShapePatterns(Operation *rootOp) {
   (void)applyPatternsGreedily(rootOp, std::move(patterns));
 }
 
-static void applyFoldIntoPackAndUnpackPatterns(Operation *rootOp) {
-  RewritePatternSet patterns(rootOp->getContext());
-  tensor::populateFoldIntoPackAndUnpackPatterns(patterns);
-  (void)applyPatternsGreedily(rootOp, std::move(patterns));
-}
-
 static void applyFoldConstantExtractSlicePatterns(Operation *rootOp) {
   RewritePatternSet patterns(rootOp->getContext());
tensor::ControlConstantExtractSliceFusionFn controlFn = @@ -148,12 +132,6 @@ applyDropRedundantInsertSliceRankExpansionPatterns(Operation *rootOp) { (void)applyPatternsGreedily(rootOp, std::move(patterns)); } -static void applySimplifyPackUnpackPatterns(Operation *rootOp) { - RewritePatternSet patterns(rootOp->getContext()); - tensor::populateSimplifyPackAndUnpackPatterns(patterns); - (void)applyPatternsGreedily(rootOp, std::move(patterns)); -} - namespace { /// Base pattern to rewrite a `tensor.collapse_shape -> tensor.extract_slice`. /// The `tensor.extract_slice` is replaced by a loop or gather operation that @@ -387,8 +365,6 @@ static LogicalResult testTrackingListenerReplacements(Operation *rootOp) { void TestTensorTransforms::runOnOperation() { Operation *rootOp = getOperation(); - if (testSimplifyPackUnpackPatterns) - applySimplifyPackUnpackPatterns(rootOp); if (testFoldConstantExtractSlice) applyFoldConstantExtractSlicePatterns(rootOp); if (testFoldConsecutiveInsertExtractSlice) @@ -399,8 +375,6 @@ void TestTensorTransforms::runOnOperation() { applyReassociativeReshapeFoldingPatterns(rootOp); if (testBubbleUpExpandShapePatterns) applyBubbleUpExpandShapePatterns(rootOp); - if (testFoldIntoPackAndUnpack) - applyFoldIntoPackAndUnpackPatterns(rootOp); if (testRewriteExtractSliceWithTiledCollapseShape) { if (failed( applyRewriteExtractFromCollapseShapePatterns(rootOp, useForeach))) diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h index 2252f5e7e97a7..29a281f096855 100644 --- a/openmp/runtime/src/kmp_os.h +++ b/openmp/runtime/src/kmp_os.h @@ -219,7 +219,8 @@ typedef kmp_uint32 kmp_uint; // stdarg handling #if (KMP_ARCH_ARM || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64 || KMP_ARCH_WASM) && \ - (KMP_OS_FREEBSD || KMP_OS_LINUX || KMP_OS_WASI) + (KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_OPENBSD || KMP_OS_DRAGONFLY || \ + KMP_OS_LINUX || KMP_OS_WASI) typedef va_list *kmp_va_list; #define kmp_va_deref(ap) (*(ap)) #define kmp_va_addr_of(ap) (&(ap)) diff --git a/utils/bazel/configure.bzl b/utils/bazel/configure.bzl index c5da28845eccf..fcc9fc7ecc483 100644 --- a/utils/bazel/configure.bzl +++ b/utils/bazel/configure.bzl @@ -172,10 +172,19 @@ def _llvm_configure_impl(repository_ctx): ) # Create a starlark file with the requested LLVM targets. - targets = repository_ctx.attr.targets + llvm_targets = repository_ctx.attr.targets repository_ctx.file( "llvm/targets.bzl", - content = "llvm_targets = " + str(targets), + content = "llvm_targets = " + str(llvm_targets), + executable = False, + ) + + # Create a starlark file with the requested BOLT targets. + bolt_targets = ["AArch64","X86","RISCV"] # Supported targets. 
+ bolt_targets = [t for t in llvm_targets if t in bolt_targets] + repository_ctx.file( + "bolt/targets.bzl", + content = "bolt_targets = " + str(bolt_targets), executable = False, ) diff --git a/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel b/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel index 187938783a550..a9a7cc59575a3 100644 --- a/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception load("@bazel_skylib//rules:expand_template.bzl", "expand_template") +load(":targets.bzl", "bolt_targets") package( default_visibility = ["//visibility:public"], @@ -16,6 +17,20 @@ genrule( cmd = "echo '#undef BOLT_REVISION' >> $@\n", ) +expand_template( + name = "target_config_def_gen", + out = "include/bolt/Core/TargetConfig.def", + substitutions = {"@BOLT_ENUM_TARGETS@": "\n".join( + ["BOLT_TARGET({})".format(target) for target in bolt_targets], + )}, + template = "include/bolt/Core/TargetConfig.def.in", +) + +cc_library( + name = "TargetConfig", + textual_hdrs = [":target_config_def_gen"], +) + cc_binary( name = "llvm-bolt-heatmap", srcs = glob([ @@ -24,6 +39,7 @@ cc_binary( deps = [ ":Profile", ":Rewrite", + ":TargetConfig", ":Utils", "//llvm:AllTargetsAsmParsers", "//llvm:AllTargetsDisassemblers", @@ -54,6 +70,7 @@ cc_binary( ":Profile", ":Rewrite", ":RuntimeLibs", + ":TargetConfig", ":TargetAArch64", ":TargetX86", ":Utils", diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index 7fd7c8b438629..2aced96c112ef 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -1048,6 +1048,7 @@ cc_library( ":frontend", ":lex", ":rewrite", + ":sema", ":serialization", "//llvm:Core", "//llvm:Support", diff --git a/utils/bazel/llvm-project-overlay/llvm/config.bzl b/utils/bazel/llvm-project-overlay/llvm/config.bzl index 6e703d22e7756..fa616bcb9a8c9 100644 --- a/utils/bazel/llvm-project-overlay/llvm/config.bzl +++ b/utils/bazel/llvm-project-overlay/llvm/config.bzl @@ -47,6 +47,7 @@ posix_defines = [ linux_defines = posix_defines + [ "_GNU_SOURCE", + "HAVE_GETAUXVAL=1", "HAVE_MALLINFO=1", "HAVE_SBRK=1", "HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC=1", diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h index 93695f8e26d27..3ef1d0c4b1651 100644 --- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h +++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h @@ -296,7 +296,7 @@ /* HAVE_PROC_PID_RUSAGE defined in Bazel */ -#define HAVE_GETAUXVAL 1 +/* HAVE_GETAUXVAL defined in Bazel */ /* Directly provide definitions here behind platform preprocessor definitions. 
* The preprocessor conditions are sufficient to handle all of the configuration diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 92aedac837197..53aca8ab042ad 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -7763,7 +7763,6 @@ td_library( name = "TensorOpsTdFiles", srcs = [ "include/mlir/Dialect/Tensor/IR/TensorBase.td", - "include/mlir/Dialect/Tensor/IR/TensorInterfaces.td", "include/mlir/Dialect/Tensor/IR/TensorOps.td", ], includes = ["include"], @@ -7813,23 +7812,6 @@ gentbl_cc_library( deps = [":TensorOpsTdFiles"], ) -gentbl_cc_library( - name = "TensorInterfacesIncGen", - tbl_outs = [ - ( - ["--gen-op-interface-decls"], - "include/mlir/Dialect/Tensor/IR/TensorInterfaces.h.inc", - ), - ( - ["--gen-op-interface-defs"], - "include/mlir/Dialect/Tensor/IR/TensorInterfaces.cpp.inc", - ), - ], - tblgen = ":mlir-tblgen", - td_file = "include/mlir/Dialect/Tensor/IR/TensorInterfaces.td", - deps = [":TensorOpsTdFiles"], -) - cc_library( name = "TensorDialect", srcs = [ @@ -7859,13 +7841,13 @@ cc_library( ":InferIntRangeInterface", ":InferTypeOpInterface", ":InliningUtils", + ":LinalgInterfaces", ":LoopLikeInterface", ":ParallelCombiningOpInterface", ":ShapedOpInterfaces", ":SideEffectInterfaces", ":SubsetOpInterface", ":Support", - ":TensorInterfacesIncGen", ":TensorOpsIncGen", ":TilingInterface", ":TransformDialectInterfaces", @@ -11206,6 +11188,23 @@ gentbl_cc_library( deps = [":LinalgOpsTdFiles"], ) +gentbl_cc_library( + name = "LinalgRelayoutOpsIncGen", + tbl_outs = [ + ( + ["-gen-op-decls"], + "include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.h.inc", + ), + ( + ["-gen-op-defs"], + "include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.cpp.inc", + ), + ], + tblgen = ":mlir-tblgen", + td_file = "include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td", + deps = [":LinalgOpsTdFiles"], +) + gentbl_cc_library( name = "LinalgEnumsIncGen", tbl_outs = [ @@ -11532,10 +11531,50 @@ cc_library( ], ) +cc_library( + name = "LinalgInterfaces", + srcs = [ + "include/mlir/Dialect/Linalg/IR/Linalg.h", + "lib/Dialect/Linalg/IR/LinalgInterfaces.cpp", + ], + hdrs = ["include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h"], + includes = ["include"], + deps = [ + ":AffineDialect", + ":ArithDialect", + ":ArithUtils", + ":BytecodeOpInterface", + ":ComplexDialect", + ":ControlFlowInterfaces", + ":CopyOpInterface", + ":DestinationStyleOpInterface", + ":DialectUtils", + ":IR", + ":InferTypeOpInterface", + ":LinalgEnumsIncGen", + ":LinalgInterfacesIncGen", + ":LinalgOpsIncGen", + ":LinalgRelayoutOpsIncGen", + ":LinalgStructuredOpsIncGen", + ":SideEffectInterfaces", + ":Support", + ":TilingInterface", + ":ViewLikeInterface", + "//llvm:Support", + ], +) + cc_library( name = "LinalgDialect", - srcs = glob(["lib/Dialect/Linalg/IR/*.cpp"]), - hdrs = glob(["include/mlir/Dialect/Linalg/IR/*.h"]), + srcs = [ + "lib/Dialect/Linalg/IR/LinalgDialect.cpp", + "lib/Dialect/Linalg/IR/LinalgOps.cpp", + "lib/Dialect/Linalg/IR/ValueBoundsOpInterfaceImpl.cpp", + ], + hdrs = [ + "include/mlir/Dialect/Linalg/IR/Linalg.h", + "include/mlir/Dialect/Linalg/IR/ValueBoundsOpInterfaceImpl.h", + ], includes = ["include"], deps = [ ":AffineDialect", @@ -11554,9 +11593,10 @@ cc_library( ":InferTypeOpInterface", ":InliningUtils", ":LinalgEnumsIncGen", - ":LinalgInterfacesIncGen", + ":LinalgInterfaces", ":LinalgNamedStructuredOpsYamlIncGen", ":LinalgOpsIncGen", + ":LinalgRelayoutOpsIncGen", ":LinalgStructuredOpsIncGen", 
":MathDialect", ":MemRefDialect", @@ -11568,6 +11608,7 @@ cc_library( ":SubsetOpInterface", ":Support", ":TensorDialect", + ":TensorUtils", ":TilingInterface", ":ValueBoundsOpInterface", ":ViewLikeInterface", @@ -11599,6 +11640,7 @@ cc_library( ":IR", ":IndexDialect", ":LinalgDialect", + ":LinalgInterfaces", ":LinalgMatchOpsIncGen", ":LinalgTransformEnumsIncGen", ":LinalgTransformOpsIncGen", @@ -11710,6 +11752,7 @@ cc_library( ":IR", ":IndexDialect", ":LinalgDialect", + ":LinalgInterfaces", ":LinalgPassIncGen", ":LinalgStructuredOpsIncGen", ":LinalgUtils", diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel index 3e6114abfc078..9b005b206a101 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -425,6 +425,7 @@ cc_library( "//mlir:LLVMDialect", "//mlir:LLVMIRToLLVMTranslation", "//mlir:LinalgDialect", + "//mlir:LinalgInterfaces", "//mlir:LoopLikeInterface", "//mlir:MemorySlotInterfaces", "//mlir:Pass", diff --git a/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel index a55c6f50118dc..d0c9f56f81cb9 100644 --- a/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel @@ -176,6 +176,7 @@ cc_test( "//mlir:ArithDialect", "//mlir:FuncDialect", "//mlir:IR", + "//mlir:LinalgDialect", "//mlir:Parser", "//mlir:SCFDialect", "//mlir:SideEffectInterfaces", @@ -211,6 +212,7 @@ cc_test( "//llvm:Support", "//llvm:TestingSupport", "//mlir:IR", + "//mlir:LinalgDialect", "//mlir:SPIRVBinaryUtils", "//mlir:SPIRVDeserialization", "//mlir:SPIRVDialect",